Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 11:00:46 +00:00)

Commit e211ec0aa2: adding black formatter to all the code
Parent commit: 5bd27191a2
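The changes below are the mechanical output of the black formatter; the exact invocation is not recorded in the commit, but it is presumably equivalent to running `black .` from the repository root. A minimal sketch of the kind of rewrite black applies (a hypothetical standalone example, modeled on the argparse calls changed in scrapy/commands/__init__.py, not copied verbatim from the diff):

# Hand-wrapped original style (single quotes, continuation-aligned arguments):
#     parser.add_argument("--logfile", metavar="FILE",
#                         help="log file. if omitted stderr will be used")
# Black output: double quotes and a wrapped call whose arguments fit on one
# indented line within the default 88-character line length.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--logfile", metavar="FILE", help="log file. if omitted stderr will be used"
)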
conftest.py (21 changed lines)
@@ -9,7 +9,7 @@ from tests.keys import generate_keys


def _py_files(folder):
    return (str(p) for p in Path(folder).rglob('*.py'))
    return (str(p) for p in Path(folder).rglob("*.py"))


collect_ignore = [
@@ -21,16 +21,16 @@ collect_ignore = [
    *_py_files("tests/CrawlerRunner"),
]

with Path('tests/ignores.txt').open(encoding="utf-8") as reader:
with Path("tests/ignores.txt").open(encoding="utf-8") as reader:
    for line in reader:
        file_path = line.strip()
        if file_path and file_path[0] != '#':
        if file_path and file_path[0] != "#":
            collect_ignore.append(file_path)

if not H2_ENABLED:
    collect_ignore.extend(
        (
            'scrapy/core/downloader/handlers/http2.py',
            "scrapy/core/downloader/handlers/http2.py",
            *_py_files("scrapy/core/http2"),
        )
    )
@@ -50,7 +50,7 @@ def pytest_addoption(parser):
    )


@pytest.fixture(scope='class')
@pytest.fixture(scope="class")
def reactor_pytest(request):
    if not request.cls:
        # doctests
@@ -61,14 +61,17 @@ def reactor_pytest(request):

@pytest.fixture(autouse=True)
def only_asyncio(request, reactor_pytest):
    if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
        pytest.skip('This test is only run with --reactor=asyncio')
    if request.node.get_closest_marker("only_asyncio") and reactor_pytest != "asyncio":
        pytest.skip("This test is only run with --reactor=asyncio")


@pytest.fixture(autouse=True)
def only_not_asyncio(request, reactor_pytest):
    if request.node.get_closest_marker('only_not_asyncio') and reactor_pytest == 'asyncio':
        pytest.skip('This test is only run without --reactor=asyncio')
    if (
        request.node.get_closest_marker("only_not_asyncio")
        and reactor_pytest == "asyncio"
    ):
        pytest.skip("This test is only run without --reactor=asyncio")


def pytest_configure(config):

@@ -11,15 +11,15 @@ class settingslist_node(nodes.General, nodes.Element):

class SettingsListDirective(Directive):
    def run(self):
        return [settingslist_node('')]
        return [settingslist_node("")]


def is_setting_index(node):
    if node.tagname == 'index' and node['entries']:
    if node.tagname == "index" and node["entries"]:
        # index entries for setting directives look like:
        # [('pair', 'SETTING_NAME; setting', 'std:setting-SETTING_NAME', '')]
        entry_type, info, refid = node['entries'][0][:3]
        return entry_type == 'pair' and info.endswith('; setting')
        entry_type, info, refid = node["entries"][0][:3]
        return entry_type == "pair" and info.endswith("; setting")
    return False


@@ -30,14 +30,14 @@ def get_setting_target(node):


def get_setting_name_and_refid(node):
    """Extract setting name from directive index node"""
    entry_type, info, refid = node['entries'][0][:3]
    return info.replace('; setting', ''), refid
    entry_type, info, refid = node["entries"][0][:3]
    return info.replace("; setting", ""), refid


def collect_scrapy_settings_refs(app, doctree):
    env = app.builder.env

    if not hasattr(env, 'scrapy_all_settings'):
    if not hasattr(env, "scrapy_all_settings"):
        env.scrapy_all_settings = []

    for node in doctree.traverse(is_setting_index):
@@ -46,18 +46,23 @@ def collect_scrapy_settings_refs(app, doctree):

        setting_name, refid = get_setting_name_and_refid(node)

        env.scrapy_all_settings.append({
            'docname': env.docname,
            'setting_name': setting_name,
            'refid': refid,
        })
        env.scrapy_all_settings.append(
            {
                "docname": env.docname,
                "setting_name": setting_name,
                "refid": refid,
            }
        )


def make_setting_element(setting_data, app, fromdocname):
    refnode = make_refnode(app.builder, fromdocname,
                           todocname=setting_data['docname'],
                           targetid=setting_data['refid'],
                           child=nodes.Text(setting_data['setting_name']))
    refnode = make_refnode(
        app.builder,
        fromdocname,
        todocname=setting_data["docname"],
        targetid=setting_data["refid"],
        child=nodes.Text(setting_data["setting_name"]),
    )
    p = nodes.paragraph()
    p += refnode

@@ -71,10 +76,13 @@ def replace_settingslist_nodes(app, doctree, fromdocname):

    for node in doctree.traverse(settingslist_node):
        settings_list = nodes.bullet_list()
        settings_list.extend([make_setting_element(d, app, fromdocname)
                              for d in sorted(env.scrapy_all_settings,
                                              key=itemgetter('setting_name'))
                              if fromdocname != d['docname']])
        settings_list.extend(
            [
                make_setting_element(d, app, fromdocname)
                for d in sorted(env.scrapy_all_settings, key=itemgetter("setting_name"))
                if fromdocname != d["docname"]
            ]
        )
        node.replace_self(settings_list)


@@ -99,41 +107,41 @@ def setup(app):
        rolename="reqmeta",
        indextemplate="pair: %s; reqmeta",
    )
    app.add_role('source', source_role)
    app.add_role('commit', commit_role)
    app.add_role('issue', issue_role)
    app.add_role('rev', rev_role)
    app.add_role("source", source_role)
    app.add_role("commit", commit_role)
    app.add_role("issue", issue_role)
    app.add_role("rev", rev_role)

    app.add_node(settingslist_node)
    app.add_directive('settingslist', SettingsListDirective)
    app.add_directive("settingslist", SettingsListDirective)

    app.connect('doctree-read', collect_scrapy_settings_refs)
    app.connect('doctree-resolved', replace_settingslist_nodes)
    app.connect("doctree-read", collect_scrapy_settings_refs)
    app.connect("doctree-resolved", replace_settingslist_nodes)


def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/blob/master/' + text
    ref = "https://github.com/scrapy/scrapy/blob/master/" + text
    set_classes(options)
    node = nodes.reference(rawtext, text, refuri=ref, **options)
    return [node], []


def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/issues/' + text
    ref = "https://github.com/scrapy/scrapy/issues/" + text
    set_classes(options)
    node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options)
    node = nodes.reference(rawtext, "issue " + text, refuri=ref, **options)
    return [node], []


def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/commit/' + text
    ref = "https://github.com/scrapy/scrapy/commit/" + text
    set_classes(options)
    node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options)
    node = nodes.reference(rawtext, "commit " + text, refuri=ref, **options)
    return [node], []


def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'http://hg.scrapy.org/scrapy/changeset/' + text
    ref = "http://hg.scrapy.org/scrapy/changeset/" + text
    set_classes(options)
    node = nodes.reference(rawtext, 'r' + text, refuri=ref, **options)
    node = nodes.reference(rawtext, "r" + text, refuri=ref, **options)
    return [node], []
docs/conf.py (175 changed lines)
@ -25,30 +25,30 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
# Add any Sphinx extension module names here, as strings. They can be extensions
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
'hoverxref.extension',
|
||||
'notfound.extension',
|
||||
'scrapydocs',
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.coverage',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.viewcode',
|
||||
"hoverxref.extension",
|
||||
"notfound.extension",
|
||||
"scrapydocs",
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.coverage",
|
||||
"sphinx.ext.intersphinx",
|
||||
"sphinx.ext.viewcode",
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
source_suffix = ".rst"
|
||||
|
||||
# The encoding of source files.
|
||||
#source_encoding = 'utf-8'
|
||||
# source_encoding = 'utf-8'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = 'index'
|
||||
master_doc = "index"
|
||||
|
||||
# General information about the project.
|
||||
project = 'Scrapy'
|
||||
copyright = f'2008–{datetime.now().year}, Scrapy developers'
|
||||
project = "Scrapy"
|
||||
copyright = f"2008–{datetime.now().year}, Scrapy developers"
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
@ -57,50 +57,51 @@ copyright = f'2008–{datetime.now().year}, Scrapy developers'
|
||||
# The short X.Y version.
|
||||
try:
|
||||
import scrapy
|
||||
version = '.'.join(map(str, scrapy.version_info[:2]))
|
||||
|
||||
version = ".".join(map(str, scrapy.version_info[:2]))
|
||||
release = scrapy.__version__
|
||||
except ImportError:
|
||||
version = ''
|
||||
release = ''
|
||||
version = ""
|
||||
release = ""
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
language = 'en'
|
||||
language = "en"
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to some
|
||||
# non-false value, then it is used:
|
||||
#today = ''
|
||||
# today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
#today_fmt = '%B %d, %Y'
|
||||
# today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of documents that shouldn't be included in the build.
|
||||
#unused_docs = []
|
||||
# unused_docs = []
|
||||
|
||||
exclude_patterns = ['build']
|
||||
exclude_patterns = ["build"]
|
||||
|
||||
# List of directories, relative to source directory, that shouldn't be searched
|
||||
# for source files.
|
||||
exclude_trees = ['.build']
|
||||
exclude_trees = [".build"]
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all documents.
|
||||
#default_role = None
|
||||
# default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
#add_function_parentheses = True
|
||||
# add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
#add_module_names = True
|
||||
# add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
#show_authors = False
|
||||
# show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
pygments_style = "sphinx"
|
||||
|
||||
# List of Sphinx warnings that will not be raised
|
||||
suppress_warnings = ['epub.unknown_project_files']
|
||||
suppress_warnings = ["epub.unknown_project_files"]
|
||||
|
||||
|
||||
# Options for HTML output
|
||||
@ -108,17 +109,18 @@ suppress_warnings = ['epub.unknown_project_files']
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#html_theme_options = {}
|
||||
# html_theme_options = {}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
# Add path to the RTD explicitly to robustify builds (otherwise might
|
||||
# fail in a clean Debian build env)
|
||||
import sphinx_rtd_theme
|
||||
|
||||
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||
|
||||
# The style sheet to use for HTML and HTML Help pages. A file of that name
|
||||
@ -128,44 +130,44 @@ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
#html_title = None
|
||||
# html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
# html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#html_logo = None
|
||||
# html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
# html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
html_static_path = ["_static"]
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
html_last_updated_fmt = '%b %d, %Y'
|
||||
html_last_updated_fmt = "%b %d, %Y"
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
#html_sidebars = {}
|
||||
# html_sidebars = {}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
# html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
#html_use_modindex = True
|
||||
# html_use_modindex = True
|
||||
|
||||
# If false, no index is generated.
|
||||
#html_use_index = True
|
||||
# html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
# html_split_index = False
|
||||
|
||||
# If true, the reST sources are included in the HTML build as _sources/<name>.
|
||||
html_copy_source = True
|
||||
@ -173,16 +175,16 @@ html_copy_source = True
|
||||
# If true, an OpenSearch description file will be output, and all pages will
|
||||
# contain a <link> tag referring to it. The value of this option must be the
|
||||
# base URL from which the finished HTML is served.
|
||||
#html_use_opensearch = ''
|
||||
# html_use_opensearch = ''
|
||||
|
||||
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = ''
|
||||
# html_file_suffix = ''
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'Scrapydoc'
|
||||
htmlhelp_basename = "Scrapydoc"
|
||||
|
||||
html_css_files = [
|
||||
'custom.css',
|
||||
"custom.css",
|
||||
]
|
||||
|
||||
|
||||
@ -190,34 +192,33 @@ html_css_files = [
|
||||
# ------------------------
|
||||
|
||||
# The paper size ('letter' or 'a4').
|
||||
#latex_paper_size = 'letter'
|
||||
# latex_paper_size = 'letter'
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#latex_font_size = '10pt'
|
||||
# latex_font_size = '10pt'
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title, author, document class [howto/manual]).
|
||||
latex_documents = [
|
||||
('index', 'Scrapy.tex', 'Scrapy Documentation',
|
||||
'Scrapy developers', 'manual'),
|
||||
("index", "Scrapy.tex", "Scrapy Documentation", "Scrapy developers", "manual"),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
# the title page.
|
||||
#latex_logo = None
|
||||
# latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||
# not chapters.
|
||||
#latex_use_parts = False
|
||||
# latex_use_parts = False
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#latex_preamble = ''
|
||||
# latex_preamble = ''
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#latex_appendices = []
|
||||
# latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#latex_use_modindex = True
|
||||
# latex_use_modindex = True
|
||||
|
||||
|
||||
# Options for the linkcheck builder
|
||||
@ -226,8 +227,9 @@ latex_documents = [
|
||||
# A list of regular expressions that match URIs that should not be checked when
|
||||
# doing a linkcheck build.
|
||||
linkcheck_ignore = [
|
||||
'http://localhost:\d+', 'http://hg.scrapy.org',
|
||||
'http://directory.google.com/'
|
||||
"http://localhost:\d+",
|
||||
"http://hg.scrapy.org",
|
||||
"http://directory.google.com/",
|
||||
]
|
||||
|
||||
|
||||
@ -237,44 +239,35 @@ coverage_ignore_pyobjects = [
|
||||
# Contract’s add_pre_hook and add_post_hook are not documented because
|
||||
# they should be transparent to contract developers, for whom pre_hook and
|
||||
# post_hook should be the actual concern.
|
||||
r'\bContract\.add_(pre|post)_hook$',
|
||||
|
||||
r"\bContract\.add_(pre|post)_hook$",
|
||||
# ContractsManager is an internal class, developers are not expected to
|
||||
# interact with it directly in any way.
|
||||
r'\bContractsManager\b$',
|
||||
|
||||
r"\bContractsManager\b$",
|
||||
# For default contracts we only want to document their general purpose in
|
||||
# their __init__ method, the methods they reimplement to achieve that purpose
|
||||
# should be irrelevant to developers using those contracts.
|
||||
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
|
||||
|
||||
r"\w+Contract\.(adjust_request_args|(pre|post)_process)$",
|
||||
# Methods of downloader middlewares are not documented, only the classes
|
||||
# themselves, since downloader middlewares are controlled through Scrapy
|
||||
# settings.
|
||||
r'^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.',
|
||||
|
||||
r"^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.",
|
||||
# Base classes of downloader middlewares are implementation details that
|
||||
# are not meant for users.
|
||||
r'^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware',
|
||||
|
||||
r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
|
||||
# Private exception used by the command-line interface implementation.
|
||||
r'^scrapy\.exceptions\.UsageError',
|
||||
|
||||
r"^scrapy\.exceptions\.UsageError",
|
||||
# Methods of BaseItemExporter subclasses are only documented in
|
||||
# BaseItemExporter.
|
||||
r'^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.',
|
||||
|
||||
r"^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.",
|
||||
# Extension behavior is only modified through settings. Methods of
|
||||
# extension classes, as well as helper functions, are implementation
|
||||
# details that are not documented.
|
||||
r'^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.', # methods
|
||||
r'^scrapy\.extensions\.[a-z]\w*?\.[a-z]', # helper functions
|
||||
|
||||
r"^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.", # methods
|
||||
r"^scrapy\.extensions\.[a-z]\w*?\.[a-z]", # helper functions
|
||||
# Never documented before, and deprecated now.
|
||||
r'^scrapy\.linkextractors\.FilteringLinkExtractor$',
|
||||
|
||||
r"^scrapy\.linkextractors\.FilteringLinkExtractor$",
|
||||
# Implementation detail of LxmlLinkExtractor
|
||||
r'^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor',
|
||||
r"^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor",
|
||||
]
|
||||
|
||||
|
||||
@ -282,18 +275,18 @@ coverage_ignore_pyobjects = [
|
||||
# -------------------------------------
|
||||
|
||||
intersphinx_mapping = {
|
||||
'attrs': ('https://www.attrs.org/en/stable/', None),
|
||||
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
|
||||
'cryptography' : ('https://cryptography.io/en/latest/', None),
|
||||
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
|
||||
'itemloaders': ('https://itemloaders.readthedocs.io/en/latest/', None),
|
||||
'pytest': ('https://docs.pytest.org/en/latest', None),
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'sphinx': ('https://www.sphinx-doc.org/en/master', None),
|
||||
'tox': ('https://tox.wiki/en/latest/', None),
|
||||
'twisted': ('https://docs.twisted.org/en/stable/', None),
|
||||
'twistedapi': ('https://docs.twisted.org/en/stable/api/', None),
|
||||
'w3lib': ('https://w3lib.readthedocs.io/en/latest', None),
|
||||
"attrs": ("https://www.attrs.org/en/stable/", None),
|
||||
"coverage": ("https://coverage.readthedocs.io/en/stable", None),
|
||||
"cryptography": ("https://cryptography.io/en/latest/", None),
|
||||
"cssselect": ("https://cssselect.readthedocs.io/en/latest", None),
|
||||
"itemloaders": ("https://itemloaders.readthedocs.io/en/latest/", None),
|
||||
"pytest": ("https://docs.pytest.org/en/latest", None),
|
||||
"python": ("https://docs.python.org/3", None),
|
||||
"sphinx": ("https://www.sphinx-doc.org/en/master", None),
|
||||
"tox": ("https://tox.wiki/en/latest/", None),
|
||||
"twisted": ("https://docs.twisted.org/en/stable/", None),
|
||||
"twistedapi": ("https://docs.twisted.org/en/stable/api/", None),
|
||||
"w3lib": ("https://w3lib.readthedocs.io/en/latest", None),
|
||||
}
|
||||
intersphinx_disabled_reftypes = []
|
||||
|
||||
@ -313,16 +306,16 @@ hoverxref_role_types = {
|
||||
"setting": "tooltip",
|
||||
"signal": "tooltip",
|
||||
}
|
||||
hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
|
||||
hoverxref_roles = ["command", "reqmeta", "setting", "signal"]
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.connect('autodoc-skip-member', maybe_skip_member)
|
||||
app.connect("autodoc-skip-member", maybe_skip_member)
|
||||
|
||||
|
||||
def maybe_skip_member(app, what, name, obj, skip, options):
|
||||
if not skip:
|
||||
# autodocs was generating a text "alias of" for the following members
|
||||
# https://github.com/sphinx-doc/sphinx/issues/4422
|
||||
return name in {'default_item_class', 'default_selector_class'}
|
||||
return name in {"default_item_class", "default_selector_class"}
|
||||
return skip
|
||||
|
@ -15,20 +15,20 @@ from scrapy.http.response.html import HtmlResponse
|
||||
|
||||
|
||||
def load_response(url: str, filename: str) -> HtmlResponse:
|
||||
input_path = Path(__file__).parent / '_tests' / filename
|
||||
input_path = Path(__file__).parent / "_tests" / filename
|
||||
return HtmlResponse(url, body=input_path.read_bytes())
|
||||
|
||||
|
||||
def setup(namespace):
|
||||
namespace['load_response'] = load_response
|
||||
namespace["load_response"] = load_response
|
||||
|
||||
|
||||
pytest_collect_file = Sybil(
|
||||
parsers=[
|
||||
DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
|
||||
PythonCodeBlockParser(future_imports=['print_function']),
|
||||
PythonCodeBlockParser(future_imports=["print_function"]),
|
||||
skip,
|
||||
],
|
||||
pattern='*.rst',
|
||||
pattern="*.rst",
|
||||
setup=setup,
|
||||
).pytest()
|
||||
|
@ -25,7 +25,7 @@ def main():
|
||||
_contents = None
|
||||
|
||||
# A regex that matches standard linkcheck output lines
|
||||
line_re = re.compile(r'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
|
||||
line_re = re.compile(r"(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))")
|
||||
|
||||
# Read lines from the linkcheck output file
|
||||
try:
|
||||
@ -66,5 +66,5 @@ def main():
|
||||
print("Not Understood: " + line)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -7,7 +7,6 @@ from twisted.internet import reactor
|
||||
|
||||
|
||||
class Root(Resource):
|
||||
|
||||
def __init__(self):
|
||||
Resource.__init__(self)
|
||||
self.concurrent = 0
|
||||
@ -26,9 +25,9 @@ class Root(Resource):
|
||||
delta = now - self.lasttime
|
||||
|
||||
# reset stats on high iter-request times caused by client restarts
|
||||
if delta > 3: # seconds
|
||||
if delta > 3: # seconds
|
||||
self._reset_stats()
|
||||
return ''
|
||||
return ""
|
||||
|
||||
self.tail.appendleft(delta)
|
||||
self.lasttime = now
|
||||
@ -37,15 +36,17 @@ class Root(Resource):
|
||||
if now - self.lastmark >= 3:
|
||||
self.lastmark = now
|
||||
qps = len(self.tail) / sum(self.tail)
|
||||
print(f'samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}')
|
||||
print(
|
||||
f"samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}"
|
||||
)
|
||||
|
||||
if 'latency' in request.args:
|
||||
latency = float(request.args['latency'][0])
|
||||
if "latency" in request.args:
|
||||
latency = float(request.args["latency"][0])
|
||||
reactor.callLater(latency, self._finish, request)
|
||||
return NOT_DONE_YET
|
||||
|
||||
self.concurrent -= 1
|
||||
return ''
|
||||
return ""
|
||||
|
||||
def _finish(self, request):
|
||||
self.concurrent -= 1
|
||||
|
@ -13,13 +13,13 @@ from scrapy.http import Request
|
||||
|
||||
class QPSSpider(Spider):
|
||||
|
||||
name = 'qps'
|
||||
benchurl = 'http://localhost:8880/'
|
||||
name = "qps"
|
||||
benchurl = "http://localhost:8880/"
|
||||
|
||||
# Max concurrency is limited by global CONCURRENT_REQUESTS setting
|
||||
max_concurrent_requests = 8
|
||||
# Requests per second goal
|
||||
qps = None # same as: 1 / download_delay
|
||||
qps = None # same as: 1 / download_delay
|
||||
download_delay = None
|
||||
# time in seconds to delay server responses
|
||||
latency = None
|
||||
@ -37,11 +37,11 @@ class QPSSpider(Spider):
|
||||
def start_requests(self):
|
||||
url = self.benchurl
|
||||
if self.latency is not None:
|
||||
url += f'?latency={self.latency}'
|
||||
url += f"?latency={self.latency}"
|
||||
|
||||
slots = int(self.slots)
|
||||
if slots > 1:
|
||||
urls = [url.replace('localhost', f'127.0.0.{x + 1}') for x in range(slots)]
|
||||
urls = [url.replace("localhost", f"127.0.0.{x + 1}") for x in range(slots)]
|
||||
else:
|
||||
urls = [url]
|
||||
|
||||
|
@ -16,14 +16,21 @@ from scrapy.item import Item, Field
|
||||
|
||||
|
||||
__all__ = [
|
||||
'__version__', 'version_info', 'twisted_version', 'Spider',
|
||||
'Request', 'FormRequest', 'Selector', 'Item', 'Field',
|
||||
"__version__",
|
||||
"version_info",
|
||||
"twisted_version",
|
||||
"Spider",
|
||||
"Request",
|
||||
"FormRequest",
|
||||
"Selector",
|
||||
"Item",
|
||||
"Field",
|
||||
]
|
||||
|
||||
|
||||
# Scrapy and Twisted versions
|
||||
__version__ = (pkgutil.get_data(__package__, "VERSION") or b"").decode("ascii").strip()
|
||||
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
|
||||
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split("."))
|
||||
twisted_version = (_txv.major, _txv.minor, _txv.micro)
|
||||
|
||||
|
||||
@ -34,7 +41,7 @@ if sys.version_info < (3, 7):
|
||||
|
||||
|
||||
# Ignore noisy twisted deprecation warnings
|
||||
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning, module="twisted")
|
||||
|
||||
|
||||
del pkgutil
|
||||
|
@ -1,4 +1,4 @@
|
||||
from scrapy.cmdline import execute
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
execute()
|
||||
|
@ -17,7 +17,7 @@ from scrapy.utils.python import garbage_collect
|
||||
class ScrapyArgumentParser(argparse.ArgumentParser):
|
||||
def _parse_optional(self, arg_string):
|
||||
# if starts with -: it means that is a parameter not a argument
|
||||
if arg_string[:2] == '-:':
|
||||
if arg_string[:2] == "-:":
|
||||
return None
|
||||
|
||||
return super()._parse_optional(arg_string)
|
||||
@ -41,12 +41,12 @@ def _get_commands_from_module(module, inproject):
|
||||
d = {}
|
||||
for cmd in _iter_command_classes(module):
|
||||
if inproject or not cmd.requires_project:
|
||||
cmdname = cmd.__module__.split('.')[-1]
|
||||
cmdname = cmd.__module__.split(".")[-1]
|
||||
d[cmdname] = cmd()
|
||||
return d
|
||||
|
||||
|
||||
def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
|
||||
def _get_commands_from_entry_points(inproject, group="scrapy.commands"):
|
||||
cmds = {}
|
||||
for entry_point in pkg_resources.iter_entry_points(group):
|
||||
obj = entry_point.load()
|
||||
@ -58,9 +58,9 @@ def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
|
||||
|
||||
|
||||
def _get_commands_dict(settings, inproject):
|
||||
cmds = _get_commands_from_module('scrapy.commands', inproject)
|
||||
cmds = _get_commands_from_module("scrapy.commands", inproject)
|
||||
cmds.update(_get_commands_from_entry_points(inproject))
|
||||
cmds_module = settings['COMMANDS_MODULE']
|
||||
cmds_module = settings["COMMANDS_MODULE"]
|
||||
if cmds_module:
|
||||
cmds.update(_get_commands_from_module(cmds_module, inproject))
|
||||
return cmds
|
||||
@ -69,7 +69,7 @@ def _get_commands_dict(settings, inproject):
|
||||
def _pop_command_name(argv):
|
||||
i = 0
|
||||
for arg in argv[1:]:
|
||||
if not arg.startswith('-'):
|
||||
if not arg.startswith("-"):
|
||||
del argv[i]
|
||||
return arg
|
||||
i += 1
|
||||
@ -124,11 +124,11 @@ def execute(argv=None, settings=None):
|
||||
settings = get_project_settings()
|
||||
# set EDITOR from environment if available
|
||||
try:
|
||||
editor = os.environ['EDITOR']
|
||||
editor = os.environ["EDITOR"]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
settings['EDITOR'] = editor
|
||||
settings["EDITOR"] = editor
|
||||
|
||||
inproject = inside_project()
|
||||
cmds = _get_commands_dict(settings, inproject)
|
||||
@ -141,11 +141,13 @@ def execute(argv=None, settings=None):
|
||||
sys.exit(2)
|
||||
|
||||
cmd = cmds[cmdname]
|
||||
parser = ScrapyArgumentParser(formatter_class=ScrapyHelpFormatter,
|
||||
usage=f"scrapy {cmdname} {cmd.syntax()}",
|
||||
conflict_handler='resolve',
|
||||
description=cmd.long_desc())
|
||||
settings.setdict(cmd.default_settings, priority='command')
|
||||
parser = ScrapyArgumentParser(
|
||||
formatter_class=ScrapyHelpFormatter,
|
||||
usage=f"scrapy {cmdname} {cmd.syntax()}",
|
||||
conflict_handler="resolve",
|
||||
description=cmd.long_desc(),
|
||||
)
|
||||
settings.setdict(cmd.default_settings, priority="command")
|
||||
cmd.settings = settings
|
||||
cmd.add_options(parser)
|
||||
opts, args = parser.parse_known_args(args=argv[1:])
|
||||
@ -168,12 +170,12 @@ def _run_command_profiled(cmd, args, opts):
|
||||
sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
|
||||
loc = locals()
|
||||
p = cProfile.Profile()
|
||||
p.runctx('cmd.run(args, opts)', globals(), loc)
|
||||
p.runctx("cmd.run(args, opts)", globals(), loc)
|
||||
if opts.profile:
|
||||
p.dump_stats(opts.profile)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
execute()
|
||||
finally:
|
||||
|
@ -27,7 +27,7 @@ class ScrapyCommand:
|
||||
self.settings: Any = None # set in scrapy.cmdline
|
||||
|
||||
def set_crawler(self, crawler):
|
||||
if hasattr(self, '_crawler'):
|
||||
if hasattr(self, "_crawler"):
|
||||
raise RuntimeError("crawler already set")
|
||||
self._crawler = crawler
|
||||
|
||||
@ -61,41 +61,58 @@ class ScrapyCommand:
|
||||
"""
|
||||
Populate option parse with options available for this command
|
||||
"""
|
||||
group = parser.add_argument_group(title='Global Options')
|
||||
group.add_argument("--logfile", metavar="FILE",
|
||||
help="log file. if omitted stderr will be used")
|
||||
group.add_argument("-L", "--loglevel", metavar="LEVEL", default=None,
|
||||
help=f"log level (default: {self.settings['LOG_LEVEL']})")
|
||||
group.add_argument("--nolog", action="store_true",
|
||||
help="disable logging completely")
|
||||
group.add_argument("--profile", metavar="FILE", default=None,
|
||||
help="write python cProfile stats to FILE")
|
||||
group.add_argument("--pidfile", metavar="FILE",
|
||||
help="write process ID to FILE")
|
||||
group.add_argument("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
|
||||
help="set/override setting (may be repeated)")
|
||||
group = parser.add_argument_group(title="Global Options")
|
||||
group.add_argument(
|
||||
"--logfile", metavar="FILE", help="log file. if omitted stderr will be used"
|
||||
)
|
||||
group.add_argument(
|
||||
"-L",
|
||||
"--loglevel",
|
||||
metavar="LEVEL",
|
||||
default=None,
|
||||
help=f"log level (default: {self.settings['LOG_LEVEL']})",
|
||||
)
|
||||
group.add_argument(
|
||||
"--nolog", action="store_true", help="disable logging completely"
|
||||
)
|
||||
group.add_argument(
|
||||
"--profile",
|
||||
metavar="FILE",
|
||||
default=None,
|
||||
help="write python cProfile stats to FILE",
|
||||
)
|
||||
group.add_argument("--pidfile", metavar="FILE", help="write process ID to FILE")
|
||||
group.add_argument(
|
||||
"-s",
|
||||
"--set",
|
||||
action="append",
|
||||
default=[],
|
||||
metavar="NAME=VALUE",
|
||||
help="set/override setting (may be repeated)",
|
||||
)
|
||||
group.add_argument("--pdb", action="store_true", help="enable pdb on failure")
|
||||
|
||||
def process_options(self, args, opts):
|
||||
try:
|
||||
self.settings.setdict(arglist_to_dict(opts.set),
|
||||
priority='cmdline')
|
||||
self.settings.setdict(arglist_to_dict(opts.set), priority="cmdline")
|
||||
except ValueError:
|
||||
raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)
|
||||
|
||||
if opts.logfile:
|
||||
self.settings.set('LOG_ENABLED', True, priority='cmdline')
|
||||
self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')
|
||||
self.settings.set("LOG_ENABLED", True, priority="cmdline")
|
||||
self.settings.set("LOG_FILE", opts.logfile, priority="cmdline")
|
||||
|
||||
if opts.loglevel:
|
||||
self.settings.set('LOG_ENABLED', True, priority='cmdline')
|
||||
self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')
|
||||
self.settings.set("LOG_ENABLED", True, priority="cmdline")
|
||||
self.settings.set("LOG_LEVEL", opts.loglevel, priority="cmdline")
|
||||
|
||||
if opts.nolog:
|
||||
self.settings.set('LOG_ENABLED', False, priority='cmdline')
|
||||
self.settings.set("LOG_ENABLED", False, priority="cmdline")
|
||||
|
||||
if opts.pidfile:
|
||||
Path(opts.pidfile).write_text(str(os.getpid()) + os.linesep, encoding="utf-8")
|
||||
Path(opts.pidfile).write_text(
|
||||
str(os.getpid()) + os.linesep, encoding="utf-8"
|
||||
)
|
||||
|
||||
if opts.pdb:
|
||||
failure.startDebugMode()
|
||||
@ -111,18 +128,39 @@ class BaseRunSpiderCommand(ScrapyCommand):
|
||||
"""
|
||||
Common class used to share functionality between the crawl, parse and runspider commands
|
||||
"""
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_argument("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
|
||||
help="set spider argument (may be repeated)")
|
||||
parser.add_argument("-o", "--output", metavar="FILE", action="append",
|
||||
help="append scraped items to the end of FILE (use - for stdout),"
|
||||
" to define format set a colon at the end of the output URI (i.e. -o FILE:FORMAT)")
|
||||
parser.add_argument("-O", "--overwrite-output", metavar="FILE", action="append",
|
||||
help="dump scraped items into FILE, overwriting any existing file,"
|
||||
" to define format set a colon at the end of the output URI (i.e. -O FILE:FORMAT)")
|
||||
parser.add_argument("-t", "--output-format", metavar="FORMAT",
|
||||
help="format to use for dumping items")
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
dest="spargs",
|
||||
action="append",
|
||||
default=[],
|
||||
metavar="NAME=VALUE",
|
||||
help="set spider argument (may be repeated)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output",
|
||||
metavar="FILE",
|
||||
action="append",
|
||||
help="append scraped items to the end of FILE (use - for stdout),"
|
||||
" to define format set a colon at the end of the output URI (i.e. -o FILE:FORMAT)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-O",
|
||||
"--overwrite-output",
|
||||
metavar="FILE",
|
||||
action="append",
|
||||
help="dump scraped items into FILE, overwriting any existing file,"
|
||||
" to define format set a colon at the end of the output URI (i.e. -O FILE:FORMAT)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--output-format",
|
||||
metavar="FORMAT",
|
||||
help="format to use for dumping items",
|
||||
)
|
||||
|
||||
def process_options(self, args, opts):
|
||||
ScrapyCommand.process_options(self, args, opts)
|
||||
@ -137,16 +175,21 @@ class BaseRunSpiderCommand(ScrapyCommand):
|
||||
opts.output_format,
|
||||
opts.overwrite_output,
|
||||
)
|
||||
self.settings.set('FEEDS', feeds, priority='cmdline')
|
||||
self.settings.set("FEEDS", feeds, priority="cmdline")
|
||||
|
||||
|
||||
class ScrapyHelpFormatter(argparse.HelpFormatter):
|
||||
"""
|
||||
Help Formatter for scrapy command line help messages.
|
||||
"""
|
||||
|
||||
def __init__(self, prog, indent_increment=2, max_help_position=24, width=None):
|
||||
super().__init__(prog, indent_increment=indent_increment,
|
||||
max_help_position=max_help_position, width=width)
|
||||
super().__init__(
|
||||
prog,
|
||||
indent_increment=indent_increment,
|
||||
max_help_position=max_help_position,
|
||||
width=width,
|
||||
)
|
||||
|
||||
def _join_parts(self, part_strings):
|
||||
parts = self.format_part_strings(part_strings)
|
||||
@ -157,11 +200,13 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
|
||||
Underline and title case command line help message headers.
|
||||
"""
|
||||
if part_strings and part_strings[0].startswith("usage: "):
|
||||
part_strings[0] = "Usage\n=====\n " + part_strings[0][len('usage: '):]
|
||||
headings = [i for i in range(len(part_strings)) if part_strings[i].endswith(':\n')]
|
||||
part_strings[0] = "Usage\n=====\n " + part_strings[0][len("usage: ") :]
|
||||
headings = [
|
||||
i for i in range(len(part_strings)) if part_strings[i].endswith(":\n")
|
||||
]
|
||||
for index in headings[::-1]:
|
||||
char = '-' if "Global Options" in part_strings[index] else '='
|
||||
char = "-" if "Global Options" in part_strings[index] else "="
|
||||
part_strings[index] = part_strings[index][:-2].title()
|
||||
underline = ''.join(["\n", (char * len(part_strings[index])), "\n"])
|
||||
underline = "".join(["\n", (char * len(part_strings[index])), "\n"])
|
||||
part_strings.insert(index + 1, underline)
|
||||
return part_strings
|
||||
|
@ -11,9 +11,9 @@ from scrapy.linkextractors import LinkExtractor
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
default_settings = {
|
||||
'LOG_LEVEL': 'INFO',
|
||||
'LOGSTATS_INTERVAL': 1,
|
||||
'CLOSESPIDER_TIMEOUT': 10,
|
||||
"LOG_LEVEL": "INFO",
|
||||
"LOGSTATS_INTERVAL": 1,
|
||||
"CLOSESPIDER_TIMEOUT": 10,
|
||||
}
|
||||
|
||||
def short_desc(self):
|
||||
@ -26,12 +26,11 @@ class Command(ScrapyCommand):
|
||||
|
||||
|
||||
class _BenchServer:
|
||||
|
||||
def __enter__(self):
|
||||
from scrapy.utils.test import get_testenv
|
||||
pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
|
||||
self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
|
||||
env=get_testenv())
|
||||
|
||||
pargs = [sys.executable, "-u", "-m", "scrapy.utils.benchserver"]
|
||||
self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE, env=get_testenv())
|
||||
self.proc.stdout.readline()
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
@ -42,15 +41,16 @@ class _BenchServer:
|
||||
|
||||
class _BenchSpider(scrapy.Spider):
|
||||
"""A spider that follows all links"""
|
||||
name = 'follow'
|
||||
|
||||
name = "follow"
|
||||
total = 10000
|
||||
show = 20
|
||||
baseurl = 'http://localhost:8998'
|
||||
baseurl = "http://localhost:8998"
|
||||
link_extractor = LinkExtractor()
|
||||
|
||||
def start_requests(self):
|
||||
qargs = {'total': self.total, 'show': self.show}
|
||||
url = f'{self.baseurl}?{urlencode(qargs, doseq=True)}'
|
||||
qargs = {"total": self.total, "show": self.show}
|
||||
url = f"{self.baseurl}?{urlencode(qargs, doseq=True)}"
|
||||
return [scrapy.Request(url, dont_filter=True)]
|
||||
|
||||
def parse(self, response):
|
||||
|
@ -39,7 +39,7 @@ class TextTestResult(_TextTestResult):
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
requires_project = True
|
||||
default_settings = {'LOG_ENABLED': False}
|
||||
default_settings = {"LOG_ENABLED": False}
|
||||
|
||||
def syntax(self):
|
||||
return "[options] <spider>"
|
||||
@ -49,14 +49,25 @@ class Command(ScrapyCommand):
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_argument("-l", "--list", dest="list", action="store_true",
|
||||
help="only list contracts, without checking them")
|
||||
parser.add_argument("-v", "--verbose", dest="verbose", default=False, action='store_true',
|
||||
help="print contract tests for all spiders")
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--list",
|
||||
dest="list",
|
||||
action="store_true",
|
||||
help="only list contracts, without checking them",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
dest="verbose",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="print contract tests for all spiders",
|
||||
)
|
||||
|
||||
def run(self, args, opts):
|
||||
# load contracts
|
||||
contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
|
||||
contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
|
||||
conman = ContractsManager(load_object(c) for c in contracts)
|
||||
runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
|
||||
result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)
|
||||
@ -66,7 +77,7 @@ class Command(ScrapyCommand):
|
||||
|
||||
spider_loader = self.crawler_process.spider_loader
|
||||
|
||||
with set_environ(SCRAPY_CHECK='true'):
|
||||
with set_environ(SCRAPY_CHECK="true"):
|
||||
for spidername in args or spider_loader.list():
|
||||
spidercls = spider_loader.load(spidername)
|
||||
spidercls.start_requests = lambda s: conman.from_spider(s, result)
|
||||
@ -85,7 +96,7 @@ class Command(ScrapyCommand):
|
||||
continue
|
||||
print(spider)
|
||||
for method in sorted(methods):
|
||||
print(f' * {method}')
|
||||
print(f" * {method}")
|
||||
else:
|
||||
start = time.time()
|
||||
self.crawler_process.start()
|
||||
|
@ -16,18 +16,23 @@ class Command(BaseRunSpiderCommand):
|
||||
if len(args) < 1:
|
||||
raise UsageError()
|
||||
elif len(args) > 1:
|
||||
raise UsageError("running 'scrapy crawl' with more than one spider is not supported")
|
||||
raise UsageError(
|
||||
"running 'scrapy crawl' with more than one spider is not supported"
|
||||
)
|
||||
spname = args[0]
|
||||
|
||||
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
|
||||
|
||||
if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
|
||||
if getattr(crawl_defer, "result", None) is not None and issubclass(
|
||||
crawl_defer.result.type, Exception
|
||||
):
|
||||
self.exitcode = 1
|
||||
else:
|
||||
self.crawler_process.start()
|
||||
|
||||
if (
|
||||
self.crawler_process.bootstrap_failed
|
||||
or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
|
||||
or hasattr(self.crawler_process, "has_exception")
|
||||
and self.crawler_process.has_exception
|
||||
):
|
||||
self.exitcode = 1
|
||||
|
@ -8,7 +8,7 @@ from scrapy.exceptions import UsageError
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = True
|
||||
default_settings = {'LOG_ENABLED': False}
|
||||
default_settings = {"LOG_ENABLED": False}
|
||||
|
||||
def syntax(self):
|
||||
return "<spider>"
|
||||
@ -17,8 +17,10 @@ class Command(ScrapyCommand):
|
||||
return "Edit spider"
|
||||
|
||||
def long_desc(self):
|
||||
return ("Edit a spider using the editor defined in the EDITOR environment"
|
||||
" variable or else the EDITOR setting")
|
||||
return (
|
||||
"Edit a spider using the editor defined in the EDITOR environment"
|
||||
" variable or else the EDITOR setting"
|
||||
)
|
||||
|
||||
def _err(self, msg):
|
||||
sys.stderr.write(msg + os.linesep)
|
||||
@ -28,12 +30,12 @@ class Command(ScrapyCommand):
|
||||
if len(args) != 1:
|
||||
raise UsageError()
|
||||
|
||||
editor = self.settings['EDITOR']
|
||||
editor = self.settings["EDITOR"]
|
||||
try:
|
||||
spidercls = self.crawler_process.spider_loader.load(args[0])
|
||||
except KeyError:
|
||||
return self._err(f"Spider not found: {args[0]}")
|
||||
|
||||
sfile = sys.modules[spidercls.__module__].__file__
|
||||
sfile = sfile.replace('.pyc', '.py')
|
||||
sfile = sfile.replace(".pyc", ".py")
|
||||
self.exitcode = os.system(f'{editor} "{sfile}"')
|
||||
|
@ -27,38 +27,51 @@ class Command(ScrapyCommand):
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_argument("--spider", dest="spider", help="use this spider")
|
||||
parser.add_argument("--headers", dest="headers", action="store_true",
|
||||
help="print response HTTP headers instead of body")
|
||||
parser.add_argument("--no-redirect", dest="no_redirect", action="store_true", default=False,
|
||||
help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
parser.add_argument(
|
||||
"--headers",
|
||||
dest="headers",
|
||||
action="store_true",
|
||||
help="print response HTTP headers instead of body",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-redirect",
|
||||
dest="no_redirect",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="do not handle HTTP 3xx status codes and print response as-is",
|
||||
)
|
||||
|
||||
def _print_headers(self, headers, prefix):
|
||||
for key, values in headers.items():
|
||||
for value in values:
|
||||
self._print_bytes(prefix + b' ' + key + b': ' + value)
|
||||
self._print_bytes(prefix + b" " + key + b": " + value)
|
||||
|
||||
def _print_response(self, response, opts):
|
||||
if opts.headers:
|
||||
self._print_headers(response.request.headers, b'>')
|
||||
print('>')
|
||||
self._print_headers(response.headers, b'<')
|
||||
self._print_headers(response.request.headers, b">")
|
||||
print(">")
|
||||
self._print_headers(response.headers, b"<")
|
||||
else:
|
||||
self._print_bytes(response.body)
|
||||
|
||||
def _print_bytes(self, bytes_):
|
||||
sys.stdout.buffer.write(bytes_ + b'\n')
|
||||
sys.stdout.buffer.write(bytes_ + b"\n")
|
||||
|
||||
def run(self, args, opts):
|
||||
if len(args) != 1 or not is_url(args[0]):
|
||||
raise UsageError()
|
||||
request = Request(args[0], callback=self._print_response,
|
||||
cb_kwargs={"opts": opts}, dont_filter=True)
|
||||
request = Request(
|
||||
args[0],
|
||||
callback=self._print_response,
|
||||
cb_kwargs={"opts": opts},
|
||||
dont_filter=True,
|
||||
)
|
||||
# by default, let the framework handle redirects,
|
||||
# i.e. command handles all codes expect 3xx
|
||||
if not opts.no_redirect:
|
||||
request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
|
||||
request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))
|
||||
else:
|
||||
request.meta['handle_httpstatus_all'] = True
|
||||
request.meta["handle_httpstatus_all"] = True
|
||||
|
||||
spidercls = DefaultSpider
|
||||
spider_loader = self.crawler_process.spider_loader
|
||||
|
@ -18,7 +18,7 @@ def sanitize_module_name(module_name):
|
||||
with underscores and prefixing it with a letter if it doesn't start
|
||||
with one
|
||||
"""
|
||||
module_name = module_name.replace('-', '_').replace('.', '_')
|
||||
module_name = module_name.replace("-", "_").replace(".", "_")
|
||||
if module_name[0] not in string.ascii_letters:
|
||||
module_name = "a" + module_name
|
||||
return module_name
|
||||
@ -27,7 +27,7 @@ def sanitize_module_name(module_name):
|
||||
def extract_domain(url):
|
||||
"""Extract domain name from URL string"""
|
||||
o = urlparse(url)
|
||||
if o.scheme == '' and o.netloc == '':
|
||||
if o.scheme == "" and o.netloc == "":
|
||||
o = urlparse("//" + url.lstrip("/"))
|
||||
return o.netloc
|
||||
|
||||
@ -35,7 +35,7 @@ def extract_domain(url):
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = False
|
||||
default_settings = {'LOG_ENABLED': False}
|
||||
default_settings = {"LOG_ENABLED": False}
|
||||
|
||||
def syntax(self):
|
||||
return "[options] <name> <domain>"
|
||||
@ -45,16 +45,40 @@ class Command(ScrapyCommand):
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_argument("-l", "--list", dest="list", action="store_true",
|
||||
help="List available templates")
|
||||
parser.add_argument("-e", "--edit", dest="edit", action="store_true",
|
||||
help="Edit spider after creating it")
|
||||
parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE",
|
||||
help="Dump template to standard output")
|
||||
parser.add_argument("-t", "--template", dest="template", default="basic",
|
||||
help="Uses a custom template.")
|
||||
parser.add_argument("--force", dest="force", action="store_true",
|
||||
help="If the spider already exists, overwrite it with the template")
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--list",
|
||||
dest="list",
|
||||
action="store_true",
|
||||
help="List available templates",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--edit",
|
||||
dest="edit",
|
||||
action="store_true",
|
||||
help="Edit spider after creating it",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--dump",
|
||||
dest="dump",
|
||||
metavar="TEMPLATE",
|
||||
help="Dump template to standard output",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--template",
|
||||
dest="template",
|
||||
default="basic",
|
||||
help="Uses a custom template.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
dest="force",
|
||||
action="store_true",
|
||||
help="If the spider already exists, overwrite it with the template",
|
||||
)
|
||||
|
||||
def run(self, args, opts):
|
||||
if opts.list:
|
||||
@ -72,7 +96,7 @@ class Command(ScrapyCommand):
|
||||
domain = extract_domain(url)
|
||||
module = sanitize_module_name(name)
|
||||
|
||||
if self.settings.get('BOT_NAME') == module:
|
||||
if self.settings.get("BOT_NAME") == module:
|
||||
print("Cannot create a spider with the same name as your project")
|
||||
return
|
||||
|
||||
@ -87,17 +111,17 @@ class Command(ScrapyCommand):
|
||||
|
||||
def _genspider(self, module, name, domain, template_name, template_file):
|
||||
"""Generate the spider module, based on the given template"""
|
||||
capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
|
||||
capitalized_module = "".join(s.capitalize() for s in module.split("_"))
|
||||
tvars = {
|
||||
'project_name': self.settings.get('BOT_NAME'),
|
||||
'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
|
||||
'module': module,
|
||||
'name': name,
|
||||
'domain': domain,
|
||||
'classname': f'{capitalized_module}Spider'
|
||||
"project_name": self.settings.get("BOT_NAME"),
|
||||
"ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
|
||||
"module": module,
|
||||
"name": name,
|
||||
"domain": domain,
|
||||
"classname": f"{capitalized_module}Spider",
|
||||
}
|
||||
if self.settings.get('NEWSPIDER_MODULE'):
|
||||
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
|
||||
if self.settings.get("NEWSPIDER_MODULE"):
|
||||
spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
|
||||
spiders_dir = Path(spiders_module.__file__).parent.resolve()
|
||||
else:
|
||||
spiders_module = None
|
||||
@ -105,13 +129,15 @@ class Command(ScrapyCommand):
|
||||
spider_file = f"{spiders_dir / module}.py"
|
||||
shutil.copyfile(template_file, spider_file)
|
||||
render_templatefile(spider_file, **tvars)
|
||||
print(f"Created spider {name!r} using template {template_name!r} ",
|
||||
end=('' if spiders_module else '\n'))
|
||||
print(
|
||||
f"Created spider {name!r} using template {template_name!r} ",
|
||||
end=("" if spiders_module else "\n"),
|
||||
)
|
||||
if spiders_module:
|
||||
print(f"in module:\n {spiders_module.__name__}.{module}")
|
||||
|
||||
def _find_template(self, template: str) -> Optional[Path]:
|
||||
template_file = Path(self.templates_dir, f'{template}.tmpl')
|
||||
template_file = Path(self.templates_dir, f"{template}.tmpl")
|
||||
if template_file.exists():
|
||||
return template_file
|
||||
print(f"Unable to find template: {template}\n")
|
||||
@ -121,11 +147,11 @@ class Command(ScrapyCommand):
|
||||
def _list_templates(self):
|
||||
print("Available templates:")
|
||||
for file in sorted(Path(self.templates_dir).iterdir()):
|
||||
if file.suffix == '.tmpl':
|
||||
if file.suffix == ".tmpl":
|
||||
print(f" {file.stem}")
|
||||
|
||||
def _spider_exists(self, name: str) -> bool:
|
||||
if not self.settings.get('NEWSPIDER_MODULE'):
|
||||
if not self.settings.get("NEWSPIDER_MODULE"):
|
||||
# if run as a standalone command and file with same filename already exists
|
||||
path = Path(name + ".py")
|
||||
if path.exists():
|
||||
@ -148,7 +174,7 @@ class Command(ScrapyCommand):
|
||||
return True
|
||||
|
||||
# a file with the same name exists in the target directory
|
||||
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
|
||||
spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
|
||||
spiders_dir = Path(cast(str, spiders_module.__file__)).parent
|
||||
spiders_dir_abs = spiders_dir.resolve()
|
||||
path = spiders_dir_abs / (name + ".py")
|
||||
@ -160,7 +186,9 @@ class Command(ScrapyCommand):
|
||||
|
||||
@property
|
||||
def templates_dir(self) -> str:
|
||||
return str(Path(
|
||||
self.settings['TEMPLATES_DIR'] or Path(scrapy.__path__[0], 'templates'),
|
||||
'spiders'
|
||||
))
|
||||
return str(
|
||||
Path(
|
||||
self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
|
||||
"spiders",
|
||||
)
|
||||
)
|
||||
|
@ -4,7 +4,7 @@ from scrapy.commands import ScrapyCommand
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = True
|
||||
default_settings = {'LOG_ENABLED': False}
|
||||
default_settings = {"LOG_ENABLED": False}
|
||||
|
||||
def short_desc(self):
|
||||
return "List available spiders"
|
||||
|
@ -32,28 +32,72 @@ class Command(BaseRunSpiderCommand):

    def add_options(self, parser):
        BaseRunSpiderCommand.add_options(self, parser)
        parser.add_argument("--spider", dest="spider", default=None,
                            help="use this spider without looking for one")
        parser.add_argument("--pipelines", action="store_true",
                            help="process items through pipelines")
        parser.add_argument("--nolinks", dest="nolinks", action="store_true",
                            help="don't show links to follow (extracted requests)")
        parser.add_argument("--noitems", dest="noitems", action="store_true",
                            help="don't show scraped items")
        parser.add_argument("--nocolour", dest="nocolour", action="store_true",
                            help="avoid using pygments to colorize the output")
        parser.add_argument("-r", "--rules", dest="rules", action="store_true",
                            help="use CrawlSpider rules to discover the callback")
        parser.add_argument("-c", "--callback", dest="callback",
                            help="use this callback for parsing, instead looking for a callback")
        parser.add_argument("-m", "--meta", dest="meta",
                            help="inject extra meta into the Request, it must be a valid raw json string")
        parser.add_argument("--cbkwargs", dest="cbkwargs",
                            help="inject extra callback kwargs into the Request, it must be a valid raw json string")
        parser.add_argument("-d", "--depth", dest="depth", type=int, default=1,
                            help="maximum depth for parsing requests [default: %(default)s]")
        parser.add_argument("-v", "--verbose", dest="verbose", action="store_true",
                            help="print each depth level one by one")
        parser.add_argument(
            "--spider",
            dest="spider",
            default=None,
            help="use this spider without looking for one",
        )
        parser.add_argument(
            "--pipelines", action="store_true", help="process items through pipelines"
        )
        parser.add_argument(
            "--nolinks",
            dest="nolinks",
            action="store_true",
            help="don't show links to follow (extracted requests)",
        )
        parser.add_argument(
            "--noitems",
            dest="noitems",
            action="store_true",
            help="don't show scraped items",
        )
        parser.add_argument(
            "--nocolour",
            dest="nocolour",
            action="store_true",
            help="avoid using pygments to colorize the output",
        )
        parser.add_argument(
            "-r",
            "--rules",
            dest="rules",
            action="store_true",
            help="use CrawlSpider rules to discover the callback",
        )
        parser.add_argument(
            "-c",
            "--callback",
            dest="callback",
            help="use this callback for parsing, instead looking for a callback",
        )
        parser.add_argument(
            "-m",
            "--meta",
            dest="meta",
            help="inject extra meta into the Request, it must be a valid raw json string",
        )
        parser.add_argument(
            "--cbkwargs",
            dest="cbkwargs",
            help="inject extra callback kwargs into the Request, it must be a valid raw json string",
        )
        parser.add_argument(
            "-d",
            "--depth",
            dest="depth",
            type=int,
            default=1,
            help="maximum depth for parsing requests [default: %(default)s]",
        )
        parser.add_argument(
            "-v",
            "--verbose",
            dest="verbose",
            action="store_true",
            help="print each depth level one by one",
        )

    @property
    def max_level(self):
@ -98,13 +142,13 @@ class Command(BaseRunSpiderCommand):

        if opts.verbose:
            for level in range(1, self.max_level + 1):
                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                print(f"\n>>> DEPTH LEVEL: {level} <<<")
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
            print(f"\n>>> STATUS DEPTH LEVEL {self.max_level} <<<")
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
@ -125,14 +169,16 @@ class Command(BaseRunSpiderCommand):
        return d

    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
        if getattr(spider, "rules", None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})
            logger.error(
                "No CrawlSpider rules found in spider %(spider)r, "
                "please specify a callback to use for parsing",
                {"spider": spider.name},
            )

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
@ -140,15 +186,17 @@ class Command(BaseRunSpiderCommand):
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
                logger.error(
                    "Unable to find spider: %(spider)s", {"spider": opts.spider}
                )
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})
                logger.error("Unable to find spider for: %(url)s", {"url": url})

        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)

        if self.spidercls:
            self.spidercls.start_requests = _start_requests
@ -158,8 +206,7 @@ class Command(BaseRunSpiderCommand):
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})
            logger.error("No response downloaded for: %(url)s", {"url": url})

    def scraped_data(self, args):
        items, requests, opts, depth, spider, callback = args
@ -173,8 +220,8 @@ class Command(BaseRunSpiderCommand):
        scraped_data = items if opts.output else []
        if depth < opts.depth:
            for req in requests:
                req.meta['_depth'] = depth + 1
                req.meta['_callback'] = req.callback
                req.meta["_depth"] = depth + 1
                req.meta["_callback"] = req.callback
                req.callback = callback
            scraped_data += requests

@ -187,7 +234,7 @@ class Command(BaseRunSpiderCommand):
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            cb = response.meta["_callback"]
            if not cb:
                if opts.callback:
                    cb = opts.callback
@ -195,23 +242,27 @@ class Command(BaseRunSpiderCommand):
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        logger.error(
                            "Cannot find a rule that matches %(url)r in spider: %(spider)s",
                            {"url": response.url, "spider": spider.name},
                        )
                        return
                else:
                    cb = 'parse'
                    cb = "parse"

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    logger.error(
                        "Cannot find callback %(callback)r in spider: %(spider)s",
                        {"callback": cb, "spider": spider.name},
                    )
                    return

            # parse items and requests
            depth = response.meta['_depth']
            depth = response.meta["_depth"]

            d = self.run_callback(response, cb, cb_kwargs)
            d.addCallback(self._get_items_and_requests, opts, depth, spider, callback)
@ -226,8 +277,8 @@ class Command(BaseRunSpiderCommand):
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.meta["_depth"] = 1
        request.meta["_callback"] = request.callback
        request.callback = callback
        return request

@ -242,16 +293,22 @@ class Command(BaseRunSpiderCommand):
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)
                raise UsageError(
                    "Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                    'Example: --meta=\'{"foo" : "bar"}\'',
                    print_help=False,
                )

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)
                raise UsageError(
                    "Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                    'Example: --cbkwargs=\'{"foo" : "bar"}\'',
                    print_help=False,
                )

    def run(self, args, opts):
        # parse arguments
@ -12,7 +12,7 @@ from scrapy.commands import BaseRunSpiderCommand

def _import_file(filepath: Union[str, PathLike]) -> ModuleType:
    abspath = Path(filepath).resolve()
    if abspath.suffix not in ('.py', '.pyw'):
    if abspath.suffix not in (".py", ".pyw"):
        raise ValueError(f"Not a Python source file: {abspath}")
    dirname = str(abspath.parent)
    sys.path = [dirname] + sys.path
@ -26,7 +26,7 @@ def _import_file(filepath: Union[str, PathLike]) -> ModuleType:
class Command(BaseRunSpiderCommand):

    requires_project = False
    default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
    default_settings = {"SPIDER_LOADER_WARN_ONLY": True}

    def syntax(self):
        return "[options] <spider_file>"
@ -7,8 +7,7 @@ from scrapy.settings import BaseSettings
class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}
    default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}

    def syntax(self):
        return "[options]"
@ -18,16 +17,33 @@ class Command(ScrapyCommand):

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_argument("--get", dest="get", metavar="SETTING",
                            help="print raw setting value")
        parser.add_argument("--getbool", dest="getbool", metavar="SETTING",
                            help="print setting value, interpreted as a boolean")
        parser.add_argument("--getint", dest="getint", metavar="SETTING",
                            help="print setting value, interpreted as an integer")
        parser.add_argument("--getfloat", dest="getfloat", metavar="SETTING",
                            help="print setting value, interpreted as a float")
        parser.add_argument("--getlist", dest="getlist", metavar="SETTING",
                            help="print setting value, interpreted as a list")
        parser.add_argument(
            "--get", dest="get", metavar="SETTING", help="print raw setting value"
        )
        parser.add_argument(
            "--getbool",
            dest="getbool",
            metavar="SETTING",
            help="print setting value, interpreted as a boolean",
        )
        parser.add_argument(
            "--getint",
            dest="getint",
            metavar="SETTING",
            help="print setting value, interpreted as an integer",
        )
        parser.add_argument(
            "--getfloat",
            dest="getfloat",
            metavar="SETTING",
            help="print setting value, interpreted as a float",
        )
        parser.add_argument(
            "--getlist",
            dest="getlist",
            metavar="SETTING",
            help="print setting value, interpreted as a list",
        )

    def run(self, args, opts):
        settings = self.crawler_process.settings
@ -16,9 +16,9 @@ class Command(ScrapyCommand):

    requires_project = False
    default_settings = {
        'KEEP_ALIVE': True,
        'LOGSTATS_INTERVAL': 0,
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        "KEEP_ALIVE": True,
        "LOGSTATS_INTERVAL": 0,
        "DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
    }

    def syntax(self):
@ -28,17 +28,26 @@ class Command(ScrapyCommand):
        return "Interactive scraping console"

    def long_desc(self):
        return ("Interactive console for scraping the given url or file. "
                "Use ./file.html syntax or full path for local file.")
        return (
            "Interactive console for scraping the given url or file. "
            "Use ./file.html syntax or full path for local file."
        )

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_argument("-c", dest="code",
                            help="evaluate the code in the shell, print the result and exit")
        parser.add_argument("--spider", dest="spider",
                            help="use this spider")
        parser.add_argument("--no-redirect", dest="no_redirect", action="store_true", default=False,
                            help="do not handle HTTP 3xx status codes and print response as-is")
        parser.add_argument(
            "-c",
            dest="code",
            help="evaluate the code in the shell, print the result and exit",
        )
        parser.add_argument("--spider", dest="spider", help="use this spider")
        parser.add_argument(
            "--no-redirect",
            dest="no_redirect",
            action="store_true",
            default=False,
            help="do not handle HTTP 3xx status codes and print response as-is",
        )

    def update_vars(self, vars):
        """You can use this function to update the Scrapy objects that will be
@ -58,8 +67,9 @@ class Command(ScrapyCommand):
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)
            spidercls = spidercls_for_request(
                spider_loader, Request(url), spidercls, log_multiple=True
            )

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
@ -74,7 +84,9 @@ class Command(ScrapyCommand):
        shell.start(url=url, redirect=not opts.no_redirect)

    def _start_crawler_thread(self):
        t = Thread(target=self.crawler_process.start,
                   kwargs={'stop_after_crawl': False, 'install_signal_handlers': False})
        t = Thread(
            target=self.crawler_process.start,
            kwargs={"stop_after_crawl": False, "install_signal_handlers": False},
        )
        t.daemon = True
        t.start()
@ -13,14 +13,14 @@ from scrapy.exceptions import UsageError
|
||||
|
||||
|
||||
TEMPLATES_TO_RENDER = (
|
||||
('scrapy.cfg',),
|
||||
('${project_name}', 'settings.py.tmpl'),
|
||||
('${project_name}', 'items.py.tmpl'),
|
||||
('${project_name}', 'pipelines.py.tmpl'),
|
||||
('${project_name}', 'middlewares.py.tmpl'),
|
||||
("scrapy.cfg",),
|
||||
("${project_name}", "settings.py.tmpl"),
|
||||
("${project_name}", "items.py.tmpl"),
|
||||
("${project_name}", "pipelines.py.tmpl"),
|
||||
("${project_name}", "middlewares.py.tmpl"),
|
||||
)
|
||||
|
||||
IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')
|
||||
IGNORE = ignore_patterns("*.pyc", "__pycache__", ".svn")
|
||||
|
||||
|
||||
def _make_writable(path):
|
||||
@ -31,8 +31,7 @@ def _make_writable(path):
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = False
|
||||
default_settings = {'LOG_ENABLED': False,
|
||||
'SPIDER_LOADER_WARN_ONLY': True}
|
||||
default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
|
||||
|
||||
def syntax(self):
|
||||
return "<project_name> [project_dir]"
|
||||
@ -45,11 +44,13 @@ class Command(ScrapyCommand):
|
||||
spec = find_spec(module_name)
|
||||
return spec is not None and spec.loader is not None
|
||||
|
||||
if not re.search(r'^[_a-zA-Z]\w*$', project_name):
|
||||
print('Error: Project names must begin with a letter and contain'
|
||||
' only\nletters, numbers and underscores')
|
||||
if not re.search(r"^[_a-zA-Z]\w*$", project_name):
|
||||
print(
|
||||
"Error: Project names must begin with a letter and contain"
|
||||
" only\nletters, numbers and underscores"
|
||||
)
|
||||
elif _module_exists(project_name):
|
||||
print(f'Error: Module {project_name!r} already exists')
|
||||
print(f"Error: Module {project_name!r} already exists")
|
||||
else:
|
||||
return True
|
||||
return False
|
||||
@ -96,9 +97,9 @@ class Command(ScrapyCommand):
|
||||
else:
|
||||
project_dir = Path(args[0])
|
||||
|
||||
if (project_dir / 'scrapy.cfg').exists():
|
||||
if (project_dir / "scrapy.cfg").exists():
|
||||
self.exitcode = 1
|
||||
print(f'Error: scrapy.cfg already exists in {project_dir.resolve()}')
|
||||
print(f"Error: scrapy.cfg already exists in {project_dir.resolve()}")
|
||||
return
|
||||
|
||||
if not self._is_valid_name(project_name):
|
||||
@ -106,12 +107,24 @@ class Command(ScrapyCommand):
|
||||
return
|
||||
|
||||
self._copytree(Path(self.templates_dir), project_dir.resolve())
|
||||
move(project_dir / 'module', project_dir / project_name)
|
||||
move(project_dir / "module", project_dir / project_name)
|
||||
for paths in TEMPLATES_TO_RENDER:
|
||||
tplfile = Path(project_dir, *(string.Template(s).substitute(project_name=project_name) for s in paths))
|
||||
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
|
||||
print(f"New Scrapy project '{project_name}', using template directory "
|
||||
f"'{self.templates_dir}', created in:")
|
||||
tplfile = Path(
|
||||
project_dir,
|
||||
*(
|
||||
string.Template(s).substitute(project_name=project_name)
|
||||
for s in paths
|
||||
),
|
||||
)
|
||||
render_templatefile(
|
||||
tplfile,
|
||||
project_name=project_name,
|
||||
ProjectName=string_camelcase(project_name),
|
||||
)
|
||||
print(
|
||||
f"New Scrapy project '{project_name}', using template directory "
|
||||
f"'{self.templates_dir}', created in:"
|
||||
)
|
||||
print(f" {project_dir.resolve()}\n")
|
||||
print("You can start your first spider with:")
|
||||
print(f" cd {project_dir}")
|
||||
@ -119,7 +132,9 @@ class Command(ScrapyCommand):
|
||||
|
||||
@property
|
||||
def templates_dir(self) -> str:
|
||||
return str(Path(
|
||||
self.settings['TEMPLATES_DIR'] or Path(scrapy.__path__[0], 'templates'),
|
||||
'project'
|
||||
))
|
||||
return str(
|
||||
Path(
|
||||
self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
|
||||
"project",
|
||||
)
|
||||
)
|
||||
|
@ -5,8 +5,7 @@ from scrapy.utils.versions import scrapy_components_versions
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
default_settings = {'LOG_ENABLED': False,
|
||||
'SPIDER_LOADER_WARN_ONLY': True}
|
||||
default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
|
||||
|
||||
def syntax(self):
|
||||
return "[-v]"
|
||||
@ -16,8 +15,13 @@ class Command(ScrapyCommand):
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_argument("--verbose", "-v", dest="verbose", action="store_true",
|
||||
help="also display twisted/python/platform info (useful for bug reports)")
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
"-v",
|
||||
dest="verbose",
|
||||
action="store_true",
|
||||
help="also display twisted/python/platform info (useful for bug reports)",
|
||||
)
|
||||
|
||||
def run(self, args, opts):
|
||||
if opts.verbose:
|
||||
|
@ -4,16 +4,17 @@ from scrapy.utils.response import open_in_browser
|
||||
|
||||
|
||||
class Command(fetch.Command):
|
||||
|
||||
def short_desc(self):
|
||||
return "Open URL in browser, as seen by Scrapy"
|
||||
|
||||
def long_desc(self):
|
||||
return "Fetch a URL using the Scrapy downloader and show its contents in a browser"
|
||||
return (
|
||||
"Fetch a URL using the Scrapy downloader and show its contents in a browser"
|
||||
)
|
||||
|
||||
def add_options(self, parser):
|
||||
super().add_options(parser)
|
||||
parser.add_argument('--headers', help=argparse.SUPPRESS)
|
||||
parser.add_argument("--headers", help=argparse.SUPPRESS)
|
||||
|
||||
def _print_response(self, response, opts):
|
||||
open_in_browser(response)
|
||||
|
@ -11,16 +11,17 @@ from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
|
||||
class Contract:
|
||||
""" Abstract class for contracts """
|
||||
"""Abstract class for contracts"""
|
||||
|
||||
request_cls = None
|
||||
|
||||
def __init__(self, method, *args):
|
||||
self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
|
||||
self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
|
||||
self.testcase_pre = _create_testcase(method, f"@{self.name} pre-hook")
|
||||
self.testcase_post = _create_testcase(method, f"@{self.name} post-hook")
|
||||
self.args = args
|
||||
|
||||
def add_pre_hook(self, request, results):
|
||||
if hasattr(self, 'pre_process'):
|
||||
if hasattr(self, "pre_process"):
|
||||
cb = request.callback
|
||||
|
||||
@wraps(cb)
|
||||
@ -43,7 +44,7 @@ class Contract:
|
||||
return request
|
||||
|
||||
def add_post_hook(self, request, results):
|
||||
if hasattr(self, 'post_process'):
|
||||
if hasattr(self, "post_process"):
|
||||
cb = request.callback
|
||||
|
||||
@wraps(cb)
|
||||
@ -88,12 +89,12 @@ class ContractsManager:
|
||||
|
||||
def extract_contracts(self, method):
|
||||
contracts = []
|
||||
for line in method.__doc__.split('\n'):
|
||||
for line in method.__doc__.split("\n"):
|
||||
line = line.strip()
|
||||
|
||||
if line.startswith('@'):
|
||||
name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
|
||||
args = re.split(r'\s+', args)
|
||||
if line.startswith("@"):
|
||||
name, args = re.match(r"@(\w+)\s*(.*)", line).groups()
|
||||
args = re.split(r"\s+", args)
|
||||
|
||||
contracts.append(self.contracts[name](method, *args))
|
||||
|
||||
@ -106,7 +107,7 @@ class ContractsManager:
|
||||
try:
|
||||
requests.append(self.from_method(bound_method, results))
|
||||
except Exception:
|
||||
case = _create_testcase(bound_method, 'contract')
|
||||
case = _create_testcase(bound_method, "contract")
|
||||
results.addError(case, sys.exc_info())
|
||||
|
||||
return requests
|
||||
@ -124,13 +125,13 @@ class ContractsManager:
|
||||
|
||||
# Don't filter requests to allow
|
||||
# testing different callbacks on the same URL.
|
||||
kwargs['dont_filter'] = True
|
||||
kwargs['callback'] = method
|
||||
kwargs["dont_filter"] = True
|
||||
kwargs["callback"] = method
|
||||
|
||||
for contract in contracts:
|
||||
kwargs = contract.adjust_request_args(kwargs)
|
||||
|
||||
args.remove('self')
|
||||
args.remove("self")
|
||||
|
||||
# check if all positional arguments are defined in kwargs
|
||||
if set(args).issubset(set(kwargs)):
|
||||
@ -146,7 +147,7 @@ class ContractsManager:
|
||||
return request
|
||||
|
||||
def _clean_req(self, request, method, results):
|
||||
""" stop the request from returning objects and records any errors """
|
||||
"""stop the request from returning objects and records any errors"""
|
||||
|
||||
cb = request.callback
|
||||
|
||||
@ -156,11 +157,11 @@ class ContractsManager:
|
||||
output = cb(response, **cb_kwargs)
|
||||
output = list(iterate_spider_output(output))
|
||||
except Exception:
|
||||
case = _create_testcase(method, 'callback')
|
||||
case = _create_testcase(method, "callback")
|
||||
results.addError(case, sys.exc_info())
|
||||
|
||||
def eb_wrapper(failure):
|
||||
case = _create_testcase(method, 'errback')
|
||||
case = _create_testcase(method, "errback")
|
||||
exc_info = failure.type, failure.value, failure.getTracebackObject()
|
||||
results.addError(case, exc_info)
|
||||
|
||||
@ -175,6 +176,6 @@ def _create_testcase(method, desc):
|
||||
def __str__(_self):
|
||||
return f"[{spider}] {method.__name__} ({desc})"
|
||||
|
||||
name = f'{spider}_{method.__name__}'
|
||||
name = f"{spider}_{method.__name__}"
|
||||
setattr(ContractTestCase, name, lambda x: x)
|
||||
return ContractTestCase(name)
|
||||
|
@ -9,50 +9,50 @@ from scrapy.http import Request
|
||||
|
||||
# contracts
|
||||
class UrlContract(Contract):
|
||||
""" Contract to set the url of the request (mandatory)
|
||||
@url http://scrapy.org
|
||||
"""Contract to set the url of the request (mandatory)
|
||||
@url http://scrapy.org
|
||||
"""
|
||||
|
||||
name = 'url'
|
||||
name = "url"
|
||||
|
||||
def adjust_request_args(self, args):
|
||||
args['url'] = self.args[0]
|
||||
args["url"] = self.args[0]
|
||||
return args
|
||||
|
||||
|
||||
class CallbackKeywordArgumentsContract(Contract):
|
||||
""" Contract to set the keyword arguments for the request.
|
||||
The value should be a JSON-encoded dictionary, e.g.:
|
||||
"""Contract to set the keyword arguments for the request.
|
||||
The value should be a JSON-encoded dictionary, e.g.:
|
||||
|
||||
@cb_kwargs {"arg1": "some value"}
|
||||
@cb_kwargs {"arg1": "some value"}
|
||||
"""
|
||||
|
||||
name = 'cb_kwargs'
|
||||
name = "cb_kwargs"
|
||||
|
||||
def adjust_request_args(self, args):
|
||||
args['cb_kwargs'] = json.loads(' '.join(self.args))
|
||||
args["cb_kwargs"] = json.loads(" ".join(self.args))
|
||||
return args
|
||||
|
||||
|
||||
class ReturnsContract(Contract):
|
||||
""" Contract to check the output of a callback
|
||||
"""Contract to check the output of a callback
|
||||
|
||||
general form:
|
||||
@returns request(s)/item(s) [min=1 [max]]
|
||||
general form:
|
||||
@returns request(s)/item(s) [min=1 [max]]
|
||||
|
||||
e.g.:
|
||||
@returns request
|
||||
@returns request 2
|
||||
@returns request 2 10
|
||||
@returns request 0 10
|
||||
e.g.:
|
||||
@returns request
|
||||
@returns request 2
|
||||
@returns request 2 10
|
||||
@returns request 0 10
|
||||
"""
|
||||
|
||||
name = 'returns'
|
||||
name = "returns"
|
||||
object_type_verifiers = {
|
||||
'request': lambda x: isinstance(x, Request),
|
||||
'requests': lambda x: isinstance(x, Request),
|
||||
'item': is_item,
|
||||
'items': is_item,
|
||||
"request": lambda x: isinstance(x, Request),
|
||||
"requests": lambda x: isinstance(x, Request),
|
||||
"item": is_item,
|
||||
"items": is_item,
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -73,7 +73,7 @@ class ReturnsContract(Contract):
|
||||
try:
|
||||
self.max_bound = int(self.args[2])
|
||||
except IndexError:
|
||||
self.max_bound = float('inf')
|
||||
self.max_bound = float("inf")
|
||||
|
||||
def post_process(self, output):
|
||||
occurrences = 0
|
||||
@ -81,23 +81,25 @@ class ReturnsContract(Contract):
|
||||
if self.obj_type_verifier(x):
|
||||
occurrences += 1
|
||||
|
||||
assertion = (self.min_bound <= occurrences <= self.max_bound)
|
||||
assertion = self.min_bound <= occurrences <= self.max_bound
|
||||
|
||||
if not assertion:
|
||||
if self.min_bound == self.max_bound:
|
||||
expected = self.min_bound
|
||||
else:
|
||||
expected = f'{self.min_bound}..{self.max_bound}'
|
||||
expected = f"{self.min_bound}..{self.max_bound}"
|
||||
|
||||
raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")
|
||||
raise ContractFail(
|
||||
f"Returned {occurrences} {self.obj_name}, expected {expected}"
|
||||
)
|
||||
|
||||
|
||||
class ScrapesContract(Contract):
|
||||
""" Contract to check presence of fields in scraped items
|
||||
@scrapes page_name page_body
|
||||
"""Contract to check presence of fields in scraped items
|
||||
@scrapes page_name page_body
|
||||
"""
|
||||
|
||||
name = 'scrapes'
|
||||
name = "scrapes"
|
||||
|
||||
def post_process(self, output):
|
||||
for x in output:
|
||||
|
@ -41,9 +41,11 @@ class Slot:
|
||||
|
||||
def __repr__(self):
|
||||
cls_name = self.__class__.__name__
|
||||
return (f"{cls_name}(concurrency={self.concurrency!r}, "
|
||||
f"delay={self.delay:.2f}, "
|
||||
f"randomize_delay={self.randomize_delay!r})")
|
||||
return (
|
||||
f"{cls_name}(concurrency={self.concurrency!r}, "
|
||||
f"delay={self.delay:.2f}, "
|
||||
f"randomize_delay={self.randomize_delay!r})"
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
@ -56,11 +58,11 @@ class Slot:
|
||||
|
||||
|
||||
def _get_concurrency_delay(concurrency, spider, settings):
|
||||
delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||
if hasattr(spider, 'download_delay'):
|
||||
delay = settings.getfloat("DOWNLOAD_DELAY")
|
||||
if hasattr(spider, "download_delay"):
|
||||
delay = spider.download_delay
|
||||
|
||||
if hasattr(spider, 'max_concurrent_requests'):
|
||||
if hasattr(spider, "max_concurrent_requests"):
|
||||
concurrency = spider.max_concurrent_requests
|
||||
|
||||
return concurrency, delay
|
||||
@ -68,7 +70,7 @@ def _get_concurrency_delay(concurrency, spider, settings):
|
||||
|
||||
class Downloader:
|
||||
|
||||
DOWNLOAD_SLOT = 'download_slot'
|
||||
DOWNLOAD_SLOT = "download_slot"
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.settings = crawler.settings
|
||||
@ -76,10 +78,10 @@ class Downloader:
|
||||
self.slots = {}
|
||||
self.active = set()
|
||||
self.handlers = DownloadHandlers(crawler)
|
||||
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
|
||||
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
|
||||
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
|
||||
self.total_concurrency = self.settings.getint("CONCURRENT_REQUESTS")
|
||||
self.domain_concurrency = self.settings.getint("CONCURRENT_REQUESTS_PER_DOMAIN")
|
||||
self.ip_concurrency = self.settings.getint("CONCURRENT_REQUESTS_PER_IP")
|
||||
self.randomize_delay = self.settings.getbool("RANDOMIZE_DOWNLOAD_DELAY")
|
||||
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
|
||||
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
|
||||
self._slot_gc_loop.start(60)
|
||||
@ -99,7 +101,9 @@ class Downloader:
|
||||
def _get_slot(self, request, spider):
|
||||
key = self._get_slot_key(request, spider)
|
||||
if key not in self.slots:
|
||||
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
|
||||
conc = (
|
||||
self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
|
||||
)
|
||||
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
|
||||
self.slots[key] = Slot(conc, delay, self.randomize_delay)
|
||||
|
||||
@ -109,7 +113,7 @@ class Downloader:
|
||||
if self.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[self.DOWNLOAD_SLOT]
|
||||
|
||||
key = urlparse_cached(request).hostname or ''
|
||||
key = urlparse_cached(request).hostname or ""
|
||||
if self.ip_concurrency:
|
||||
key = dnscache.get(key, key)
|
||||
|
||||
@ -124,9 +128,9 @@ class Downloader:
|
||||
return response
|
||||
|
||||
slot.active.add(request)
|
||||
self.signals.send_catch_log(signal=signals.request_reached_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.request_reached_downloader, request=request, spider=spider
|
||||
)
|
||||
deferred = defer.Deferred().addBoth(_deactivate)
|
||||
slot.queue.append((request, deferred))
|
||||
self._process_queue(spider, slot)
|
||||
@ -134,6 +138,7 @@ class Downloader:
|
||||
|
||||
def _process_queue(self, spider, slot):
|
||||
from twisted.internet import reactor
|
||||
|
||||
if slot.latercall and slot.latercall.active():
|
||||
return
|
||||
|
||||
@ -143,7 +148,9 @@ class Downloader:
|
||||
if delay:
|
||||
penalty = delay - now + slot.lastseen
|
||||
if penalty > 0:
|
||||
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
|
||||
slot.latercall = reactor.callLater(
|
||||
penalty, self._process_queue, spider, slot
|
||||
)
|
||||
return
|
||||
|
||||
# Process enqueued requests if there are free slots to transfer for this slot
|
||||
@ -166,11 +173,14 @@ class Downloader:
|
||||
# 2. Notify response_downloaded listeners about the recent download
|
||||
# before querying queue for next request
|
||||
def _downloaded(response):
|
||||
self.signals.send_catch_log(signal=signals.response_downloaded,
|
||||
response=response,
|
||||
request=request,
|
||||
spider=spider)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.response_downloaded,
|
||||
response=response,
|
||||
request=request,
|
||||
spider=spider,
|
||||
)
|
||||
return response
|
||||
|
||||
dfd.addCallback(_downloaded)
|
||||
|
||||
# 3. After response arrives, remove the request from transferring
|
||||
@ -182,9 +192,9 @@ class Downloader:
|
||||
def finish_transferring(_):
|
||||
slot.transferring.remove(request)
|
||||
self._process_queue(spider, slot)
|
||||
self.signals.send_catch_log(signal=signals.request_left_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.request_left_downloader, request=request, spider=spider
|
||||
)
|
||||
return _
|
||||
|
||||
return dfd.addBoth(finish_transferring)
|
||||
|
@ -2,13 +2,22 @@ import warnings
|
||||
|
||||
from OpenSSL import SSL
|
||||
from twisted.internet._sslverify import _setAcceptableProtocols
|
||||
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
|
||||
from twisted.internet.ssl import (
|
||||
optionsForClientTLS,
|
||||
CertificateOptions,
|
||||
platformTrust,
|
||||
AcceptableCiphers,
|
||||
)
|
||||
from twisted.web.client import BrowserLikePolicyForHTTPS
|
||||
from twisted.web.iweb import IPolicyForHTTPS
|
||||
from zope.interface.declarations import implementer
|
||||
from zope.interface.verify import verifyObject
|
||||
|
||||
from scrapy.core.downloader.tls import DEFAULT_CIPHERS, openssl_methods, ScrapyClientTLSOptions
|
||||
from scrapy.core.downloader.tls import (
|
||||
DEFAULT_CIPHERS,
|
||||
openssl_methods,
|
||||
ScrapyClientTLSOptions,
|
||||
)
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
|
||||
|
||||
@ -24,7 +33,14 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
understand the TLSv1, TLSv1.1 and TLSv1.2 protocols.'
|
||||
"""
|
||||
|
||||
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
method=SSL.SSLv23_METHOD,
|
||||
tls_verbose_logging=False,
|
||||
tls_ciphers=None,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._ssl_method = method
|
||||
self.tls_verbose_logging = tls_verbose_logging
|
||||
@ -35,9 +51,15 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
|
||||
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
|
||||
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
|
||||
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
|
||||
tls_verbose_logging = settings.getbool("DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING")
|
||||
tls_ciphers = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
|
||||
return cls(
|
||||
method=method,
|
||||
tls_verbose_logging=tls_verbose_logging,
|
||||
tls_ciphers=tls_ciphers,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def getCertificateOptions(self):
|
||||
# setting verify=True will require you to provide CAs
|
||||
@ -53,7 +75,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
# not calling super().__init__
|
||||
return CertificateOptions(
|
||||
verify=False,
|
||||
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
|
||||
method=getattr(self, "method", getattr(self, "_ssl_method", None)),
|
||||
fixBrokenPeers=True,
|
||||
acceptableCiphers=self.tls_ciphers,
|
||||
)
|
||||
@ -64,8 +86,11 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
return self.getCertificateOptions().getContext()
|
||||
|
||||
def creatorForNetloc(self, hostname, port):
|
||||
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
|
||||
verbose_logging=self.tls_verbose_logging)
|
||||
return ScrapyClientTLSOptions(
|
||||
hostname.decode("ascii"),
|
||||
self.getContext(),
|
||||
verbose_logging=self.tls_verbose_logging,
|
||||
)
|
||||
|
||||
|
||||
@implementer(IPolicyForHTTPS)
|
||||
@ -95,7 +120,7 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
|
||||
return optionsForClientTLS(
|
||||
hostname=hostname.decode("ascii"),
|
||||
trustRoot=platformTrust(),
|
||||
extraCertificateOptions={'method': self._ssl_method},
|
||||
extraCertificateOptions={"method": self._ssl_method},
|
||||
)
|
||||
|
||||
|
||||
@ -118,8 +143,8 @@ class AcceptableProtocolsContextFactory:
|
||||
|
||||
|
||||
def load_context_factory_from_settings(settings, crawler):
|
||||
ssl_method = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
|
||||
context_factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
|
||||
ssl_method = openssl_methods[settings.get("DOWNLOADER_CLIENT_TLS_METHOD")]
|
||||
context_factory_cls = load_object(settings["DOWNLOADER_CLIENTCONTEXTFACTORY"])
|
||||
# try method-aware context factory
|
||||
try:
|
||||
context_factory = create_instance(
|
||||
|
@ -15,14 +15,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DownloadHandlers:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self._crawler = crawler
|
||||
self._schemes = {} # stores acceptable schemes on instancing
|
||||
self._handlers = {} # stores instanced handlers for schemes
|
||||
self._notconfigured = {} # remembers failed handlers
|
||||
handlers = without_none_values(
|
||||
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
|
||||
crawler.settings.getwithbase("DOWNLOAD_HANDLERS")
|
||||
)
|
||||
for scheme, clspath in handlers.items():
|
||||
self._schemes[scheme] = clspath
|
||||
self._load_handler(scheme, skip_lazy=True)
|
||||
@ -38,7 +38,7 @@ class DownloadHandlers:
|
||||
if scheme in self._notconfigured:
|
||||
return None
|
||||
if scheme not in self._schemes:
|
||||
self._notconfigured[scheme] = 'no handler available for that scheme'
|
||||
self._notconfigured[scheme] = "no handler available for that scheme"
|
||||
return None
|
||||
|
||||
return self._load_handler(scheme)
|
||||
@ -47,7 +47,7 @@ class DownloadHandlers:
|
||||
path = self._schemes[scheme]
|
||||
try:
|
||||
dhcls = load_object(path)
|
||||
if skip_lazy and getattr(dhcls, 'lazy', True):
|
||||
if skip_lazy and getattr(dhcls, "lazy", True):
|
||||
return None
|
||||
dh = create_instance(
|
||||
objcls=dhcls,
|
||||
@ -58,9 +58,12 @@ class DownloadHandlers:
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
except Exception as ex:
|
||||
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
|
||||
{"clspath": path, "scheme": scheme},
|
||||
exc_info=True, extra={'crawler': self._crawler})
|
||||
logger.error(
|
||||
'Loading "%(clspath)s" for scheme "%(scheme)s"',
|
||||
{"clspath": path, "scheme": scheme},
|
||||
exc_info=True,
|
||||
extra={"crawler": self._crawler},
|
||||
)
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
else:
|
||||
@ -71,11 +74,13 @@ class DownloadHandlers:
|
||||
scheme = urlparse_cached(request).scheme
|
||||
handler = self._get_handler(scheme)
|
||||
if not handler:
|
||||
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
|
||||
raise NotSupported(
|
||||
f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}"
|
||||
)
|
||||
return handler.download_request(request, spider)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _close(self, *_a, **_kw):
|
||||
for dh in self._handlers.values():
|
||||
if hasattr(dh, 'close'):
|
||||
if hasattr(dh, "close"):
|
||||
yield dh.close()
|
||||
|
@ -14,9 +14,8 @@ class DataURIDownloadHandler:
|
||||
respcls = responsetypes.from_mimetype(uri.media_type)
|
||||
|
||||
resp_kwargs = {}
|
||||
if (issubclass(respcls, TextResponse)
|
||||
and uri.media_type.split('/')[0] == 'text'):
|
||||
charset = uri.media_type_parameters.get('charset')
|
||||
resp_kwargs['encoding'] = charset
|
||||
if issubclass(respcls, TextResponse) and uri.media_type.split("/")[0] == "text":
|
||||
charset = uri.media_type_parameters.get("charset")
|
||||
resp_kwargs["encoding"] = charset
|
||||
|
||||
return respcls(url=request.url, body=uri.data, **resp_kwargs)
|
||||
|
@ -71,9 +71,9 @@ class FTPDownloadHandler:
|
||||
}
|
||||
|
||||
def __init__(self, settings):
|
||||
self.default_user = settings['FTP_USER']
|
||||
self.default_password = settings['FTP_PASSWORD']
|
||||
self.passive_mode = settings['FTP_PASSIVE_MODE']
|
||||
self.default_user = settings["FTP_USER"]
|
||||
self.default_password = settings["FTP_PASSWORD"]
|
||||
self.passive_mode = settings["FTP_PASSIVE_MODE"]
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
@ -81,12 +81,16 @@ class FTPDownloadHandler:
|
||||
|
||||
def download_request(self, request, spider):
|
||||
from twisted.internet import reactor
|
||||
|
||||
parsed_url = urlparse_cached(request)
|
||||
user = request.meta.get("ftp_user", self.default_user)
|
||||
password = request.meta.get("ftp_password", self.default_password)
|
||||
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
||||
self.passive_mode)) else 0
|
||||
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
|
||||
passive_mode = (
|
||||
1 if bool(request.meta.get("ftp_passive", self.passive_mode)) else 0
|
||||
)
|
||||
creator = ClientCreator(
|
||||
reactor, FTPClient, user, password, passive=passive_mode
|
||||
)
|
||||
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
|
||||
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
|
||||
|
||||
@ -103,7 +107,7 @@ class FTPDownloadHandler:
|
||||
def _build_response(self, result, request, protocol):
|
||||
self.result = result
|
||||
protocol.close()
|
||||
headers = {"local filename": protocol.filename or '', "size": protocol.size}
|
||||
headers = {"local filename": protocol.filename or "", "size": protocol.size}
|
||||
body = to_bytes(protocol.filename or protocol.body.read())
|
||||
respcls = responsetypes.from_args(url=request.url, body=body)
|
||||
return respcls(url=request.url, status=200, body=body, headers=headers)
|
||||
@ -115,5 +119,7 @@ class FTPDownloadHandler:
|
||||
if m:
|
||||
ftpcode = m.group()
|
||||
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
|
||||
return Response(url=request.url, status=httpcode, body=to_bytes(message))
|
||||
return Response(
|
||||
url=request.url, status=httpcode, body=to_bytes(message)
|
||||
)
|
||||
raise result.type(result.value)
|
||||
|
@ -8,8 +8,10 @@ class HTTP10DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings, crawler=None):
|
||||
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
|
||||
self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
|
||||
self.HTTPClientFactory = load_object(settings["DOWNLOADER_HTTPCLIENTFACTORY"])
|
||||
self.ClientContextFactory = load_object(
|
||||
settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]
|
||||
)
|
||||
self._settings = settings
|
||||
self._crawler = crawler
|
||||
|
||||
@ -25,8 +27,9 @@ class HTTP10DownloadHandler:
|
||||
|
||||
def _connect(self, factory):
|
||||
from twisted.internet import reactor
|
||||
|
||||
host, port = to_unicode(factory.host), factory.port
|
||||
if factory.scheme == b'https':
|
||||
if factory.scheme == b"https":
|
||||
client_context_factory = create_instance(
|
||||
objcls=self.ClientContextFactory,
|
||||
settings=self._settings,
|
||||
|
@ -12,7 +12,13 @@ from twisted.internet import defer, protocol, ssl
|
||||
from twisted.internet.endpoints import TCP4ClientEndpoint
|
||||
from twisted.internet.error import TimeoutError
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
||||
from twisted.web.client import (
|
||||
Agent,
|
||||
HTTPConnectionPool,
|
||||
ResponseDone,
|
||||
ResponseFailed,
|
||||
URI,
|
||||
)
|
||||
from twisted.web.http import _DataLoss, PotentialDataLoss
|
||||
from twisted.web.http_headers import Headers as TxHeaders
|
||||
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
||||
@ -36,14 +42,17 @@ class HTTP11DownloadHandler:
|
||||
self._crawler = crawler
|
||||
|
||||
from twisted.internet import reactor
|
||||
|
||||
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
||||
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self._pool.maxPersistentPerHost = settings.getint(
|
||||
"CONCURRENT_REQUESTS_PER_DOMAIN"
|
||||
)
|
||||
self._pool._factory.noisy = False
|
||||
|
||||
self._contextFactory = load_context_factory_from_settings(settings, crawler)
|
||||
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
|
||||
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
|
||||
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
|
||||
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
|
||||
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")
|
||||
self._fail_on_dataloss = settings.getbool("DOWNLOAD_FAIL_ON_DATALOSS")
|
||||
self._disconnect_timeout = 1
|
||||
|
||||
@classmethod
|
||||
@ -55,8 +64,8 @@ class HTTP11DownloadHandler:
|
||||
agent = ScrapyAgent(
|
||||
contextFactory=self._contextFactory,
|
||||
pool=self._pool,
|
||||
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
||||
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
||||
maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
|
||||
warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
|
||||
fail_on_dataloss=self._fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
@ -64,6 +73,7 @@ class HTTP11DownloadHandler:
|
||||
|
||||
def close(self):
|
||||
from twisted.internet import reactor
|
||||
|
||||
d = self._pool.closeCachedConnections()
|
||||
# closeCachedConnections will hang on network or server issues, so
|
||||
# we'll manually timeout the deferred.
|
||||
@ -96,11 +106,23 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
with this endpoint comes from the pool and a CONNECT has already been issued
|
||||
for it.
|
||||
"""
|
||||
|
||||
_truncatedLength = 1000
|
||||
_responseAnswer = r'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,' + str(_truncatedLength) + r'})'
|
||||
_responseAnswer = (
|
||||
r"HTTP/1\.. (?P<status>\d{3})(?P<reason>.{," + str(_truncatedLength) + r"})"
|
||||
)
|
||||
_responseMatcher = re.compile(_responseAnswer.encode())
|
||||
|
||||
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
|
||||
def __init__(
|
||||
self,
|
||||
reactor,
|
||||
host,
|
||||
port,
|
||||
proxyConf,
|
||||
contextFactory,
|
||||
timeout=30,
|
||||
bindAddress=None,
|
||||
):
|
||||
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
|
||||
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
|
||||
self._tunnelReadyDeferred = defer.Deferred()
|
||||
@ -111,7 +133,9 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
|
||||
def requestTunnel(self, protocol):
|
||||
"""Asks the proxy to open a tunnel."""
|
||||
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
|
||||
tunnelReq = tunnel_request_data(
|
||||
self._tunneledHost, self._tunneledPort, self._proxyAuthHeader
|
||||
)
|
||||
protocol.transport.write(tunnelReq)
|
||||
self._protocolDataReceived = protocol.dataReceived
|
||||
protocol.dataReceived = self.processProxyResponse
|
||||
@ -129,24 +153,30 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
# from the proxy so that we don't send those bytes to the TLS layer
|
||||
#
|
||||
# see https://github.com/scrapy/scrapy/issues/2491
|
||||
if b'\r\n\r\n' not in self._connectBuffer:
|
||||
if b"\r\n\r\n" not in self._connectBuffer:
|
||||
return
|
||||
self._protocol.dataReceived = self._protocolDataReceived
|
||||
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
|
||||
if respm and int(respm.group('status')) == 200:
|
||||
if respm and int(respm.group("status")) == 200:
|
||||
# set proper Server Name Indication extension
|
||||
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
|
||||
sslOptions = self._contextFactory.creatorForNetloc(
|
||||
self._tunneledHost, self._tunneledPort
|
||||
)
|
||||
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
|
||||
self._tunnelReadyDeferred.callback(self._protocol)
|
||||
else:
|
||||
if respm:
|
||||
extra = {'status': int(respm.group('status')),
|
||||
'reason': respm.group('reason').strip()}
|
||||
extra = {
|
||||
"status": int(respm.group("status")),
|
||||
"reason": respm.group("reason").strip(),
|
||||
}
|
||||
else:
|
||||
extra = rcvd_bytes[:self._truncatedLength]
|
||||
extra = rcvd_bytes[: self._truncatedLength]
|
||||
self._tunnelReadyDeferred.errback(
|
||||
TunnelError('Could not open CONNECT tunnel with proxy '
|
||||
f'{self._host}:{self._port} [{extra!r}]')
|
||||
TunnelError(
|
||||
"Could not open CONNECT tunnel with proxy "
|
||||
f"{self._host}:{self._port} [{extra!r}]"
|
||||
)
|
||||
)
|
||||
|
||||
def connectFailed(self, reason):
|
||||
@ -173,12 +203,12 @@ def tunnel_request_data(host, port, proxy_auth_header=None):
|
||||
>>> s(tunnel_request_data(b"example.com", "8090"))
|
||||
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
|
||||
"""
|
||||
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
|
||||
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
|
||||
tunnel_req += b'Host: ' + host_value + b'\r\n'
|
||||
host_value = to_bytes(host, encoding="ascii") + b":" + to_bytes(str(port))
|
||||
tunnel_req = b"CONNECT " + host_value + b" HTTP/1.1\r\n"
|
||||
tunnel_req += b"Host: " + host_value + b"\r\n"
|
||||
if proxy_auth_header:
|
||||
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
|
||||
tunnel_req += b'\r\n'
|
||||
tunnel_req += b"Proxy-Authorization: " + proxy_auth_header + b"\r\n"
|
||||
tunnel_req += b"\r\n"
|
||||
return tunnel_req
|
||||
|
||||
|
||||
@ -190,8 +220,15 @@ class TunnelingAgent(Agent):
|
||||
proxy involved.
|
||||
"""
|
||||
|
||||
def __init__(self, reactor, proxyConf, contextFactory=None,
|
||||
connectTimeout=None, bindAddress=None, pool=None):
|
||||
def __init__(
|
||||
self,
|
||||
reactor,
|
||||
proxyConf,
|
||||
contextFactory=None,
|
||||
connectTimeout=None,
|
||||
bindAddress=None,
|
||||
pool=None,
|
||||
):
|
||||
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
|
||||
self._proxyConf = proxyConf
|
||||
self._contextFactory = contextFactory
|
||||
@ -207,7 +244,9 @@ class TunnelingAgent(Agent):
|
||||
bindAddress=self._endpointFactory._bindAddress,
|
||||
)
|
||||
|
||||
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
|
||||
def _requestWithEndpoint(
|
||||
self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath
|
||||
):
|
||||
# proxy host and port are required for HTTP pool `key`
|
||||
# otherwise, same remote host connection request could reuse
|
||||
# a cached tunneled connection to a different proxy
|
||||
@ -224,8 +263,9 @@ class TunnelingAgent(Agent):
|
||||
|
||||
|
||||
class ScrapyProxyAgent(Agent):
|
||||
|
||||
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
|
||||
def __init__(
|
||||
self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None
|
||||
):
|
||||
super().__init__(
|
||||
reactor=reactor,
|
||||
connectTimeout=connectTimeout,
|
||||
@ -257,8 +297,17 @@ class ScrapyAgent:
|
||||
_ProxyAgent = ScrapyProxyAgent
|
||||
_TunnelingAgent = TunnelingAgent
|
||||
|
||||
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
||||
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
|
||||
def __init__(
|
||||
self,
|
||||
contextFactory=None,
|
||||
connectTimeout=10,
|
||||
bindAddress=None,
|
||||
pool=None,
|
||||
maxsize=0,
|
||||
warnsize=0,
|
||||
fail_on_dataloss=True,
|
||||
crawler=None,
|
||||
):
|
||||
self._contextFactory = contextFactory
|
||||
self._connectTimeout = connectTimeout
|
||||
self._bindAddress = bindAddress
|
||||
@ -271,14 +320,15 @@ class ScrapyAgent:
|
||||
|
||||
def _get_agent(self, request, timeout):
|
||||
from twisted.internet import reactor
|
||||
bindaddress = request.meta.get('bindaddress') or self._bindAddress
|
||||
proxy = request.meta.get('proxy')
|
||||
|
||||
bindaddress = request.meta.get("bindaddress") or self._bindAddress
|
||||
proxy = request.meta.get("proxy")
|
||||
if proxy:
|
||||
proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
|
||||
scheme = _parse(request.url)[0]
|
||||
proxyHost = to_unicode(proxyHost)
|
||||
if scheme == b'https':
|
||||
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
|
||||
if scheme == b"https":
|
||||
proxyAuth = request.headers.get(b"Proxy-Authorization", None)
|
||||
proxyConf = (proxyHost, proxyPort, proxyAuth)
|
||||
return self._TunnelingAgent(
|
||||
reactor=reactor,
|
||||
@ -288,11 +338,11 @@ class ScrapyAgent:
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
proxyScheme = proxyScheme or b'http'
|
||||
proxyURI = urlunparse((proxyScheme, proxyNetloc, proxyParams, '', '', ''))
|
||||
proxyScheme = proxyScheme or b"http"
|
||||
proxyURI = urlunparse((proxyScheme, proxyNetloc, proxyParams, "", "", ""))
|
||||
return self._ProxyAgent(
|
||||
reactor=reactor,
|
||||
proxyURI=to_bytes(proxyURI, encoding='ascii'),
|
||||
proxyURI=to_bytes(proxyURI, encoding="ascii"),
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
@ -308,7 +358,8 @@ class ScrapyAgent:
|
||||
|
||||
def download_request(self, request):
|
||||
from twisted.internet import reactor
|
||||
timeout = request.meta.get('download_timeout') or self._connectTimeout
|
||||
|
||||
timeout = request.meta.get("download_timeout") or self._connectTimeout
|
||||
agent = self._get_agent(request, timeout)
|
||||
|
||||
# request details
|
||||
@ -316,13 +367,15 @@ class ScrapyAgent:
|
||||
method = to_bytes(request.method)
|
||||
headers = TxHeaders(request.headers)
|
||||
if isinstance(agent, self._TunnelingAgent):
|
||||
headers.removeHeader(b'Proxy-Authorization')
|
||||
headers.removeHeader(b"Proxy-Authorization")
|
||||
if request.body:
|
||||
bodyproducer = _RequestBodyProducer(request.body)
|
||||
else:
|
||||
bodyproducer = None
|
||||
start_time = time()
|
||||
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
|
||||
d = agent.request(
|
||||
method, to_bytes(url, encoding="ascii"), headers, bodyproducer
|
||||
)
|
||||
# set download latency
|
||||
d.addCallback(self._cb_latency, request, start_time)
|
||||
# response body is ready to be consumed
|
||||
@ -345,14 +398,14 @@ class ScrapyAgent:
|
||||
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
|
||||
|
||||
def _cb_latency(self, result, request, start_time):
|
||||
request.meta['download_latency'] = time() - start_time
|
||||
request.meta["download_latency"] = time() - start_time
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _headers_from_twisted_response(response):
|
||||
headers = Headers()
|
||||
if response.length != UNKNOWN_LENGTH:
|
||||
headers[b'Content-Length'] = str(response.length).encode()
|
||||
headers[b"Content-Length"] = str(response.length).encode()
|
||||
headers.update(response.headers.getAllRawHeaders())
|
||||
return headers
|
||||
|
||||
@ -366,8 +419,10 @@ class ScrapyAgent:
|
||||
)
|
||||
for handler, result in headers_received_result:
|
||||
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
|
||||
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": request, "handler": handler.__qualname__})
|
||||
logger.debug(
|
||||
"Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": request, "handler": handler.__qualname__},
|
||||
)
|
||||
txresponse._transport.stopProducing()
|
||||
txresponse._transport.loseConnection()
|
||||
return {
|
||||
@ -389,15 +444,23 @@ class ScrapyAgent:
|
||||
"ip_address": None,
|
||||
}
|
||||
|
||||
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
||||
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
||||
maxsize = request.meta.get("download_maxsize", self._maxsize)
|
||||
warnsize = request.meta.get("download_warnsize", self._warnsize)
|
||||
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
|
||||
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
|
||||
fail_on_dataloss = request.meta.get(
|
||||
"download_fail_on_dataloss", self._fail_on_dataloss
|
||||
)
|
||||
|
||||
if maxsize and expected_size > maxsize:
|
||||
warning_msg = ("Cancelling download of %(url)s: expected response "
|
||||
"size (%(size)s) larger than download max size (%(maxsize)s).")
|
||||
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
|
||||
warning_msg = (
|
||||
"Cancelling download of %(url)s: expected response "
|
||||
"size (%(size)s) larger than download max size (%(maxsize)s)."
|
||||
)
|
||||
warning_args = {
|
||||
"url": request.url,
|
||||
"size": expected_size,
|
||||
"maxsize": maxsize,
|
||||
}
|
||||
|
||||
logger.warning(warning_msg, warning_args)
|
||||
|
||||
@ -405,9 +468,11 @@ class ScrapyAgent:
|
||||
raise defer.CancelledError(warning_msg % warning_args)
|
||||
|
||||
if warnsize and expected_size > warnsize:
|
||||
logger.warning("Expected response size (%(size)s) larger than "
|
||||
"download warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'size': expected_size, 'warnsize': warnsize, 'request': request})
|
||||
logger.warning(
|
||||
"Expected response size (%(size)s) larger than "
|
||||
"download warn size (%(warnsize)s) in request %(request)s.",
|
||||
{"size": expected_size, "warnsize": warnsize, "request": request},
|
||||
)
|
||||
|
||||
def _cancel(_):
|
||||
# Abort connection immediately.
|
||||
@ -457,7 +522,6 @@ class ScrapyAgent:
|
||||
|
||||
@implementer(IBodyProducer)
|
||||
class _RequestBodyProducer:
|
||||
|
||||
def __init__(self, body):
|
||||
self.body = body
|
||||
self.length = len(body)
|
||||
@ -474,8 +538,16 @@ class _RequestBodyProducer:
|
||||
|
||||
|
||||
class _ResponseReader(protocol.Protocol):
|
||||
|
||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
|
||||
def __init__(
|
||||
self,
|
||||
finished,
|
||||
txresponse,
|
||||
request,
|
||||
maxsize,
|
||||
warnsize,
|
||||
fail_on_dataloss,
|
||||
crawler,
|
||||
):
|
||||
self._finished = finished
|
||||
self._txresponse = txresponse
|
||||
self._request = request
|
||||
@ -491,22 +563,28 @@ class _ResponseReader(protocol.Protocol):
|
||||
self._crawler = crawler
|
||||
|
||||
def _finish_response(self, flags=None, failure=None):
|
||||
self._finished.callback({
|
||||
"txresponse": self._txresponse,
|
||||
"body": self._bodybuf.getvalue(),
|
||||
"flags": flags,
|
||||
"certificate": self._certificate,
|
||||
"ip_address": self._ip_address,
|
||||
"failure": failure,
|
||||
})
|
||||
self._finished.callback(
|
||||
{
|
||||
"txresponse": self._txresponse,
|
||||
"body": self._bodybuf.getvalue(),
|
||||
"flags": flags,
|
||||
"certificate": self._certificate,
|
||||
"ip_address": self._ip_address,
|
||||
"failure": failure,
|
||||
}
|
||||
)
|
||||
|
||||
def connectionMade(self):
|
||||
if self._certificate is None:
|
||||
with suppress(AttributeError):
|
||||
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
||||
self._certificate = ssl.Certificate(
|
||||
self.transport._producer.getPeerCertificate()
|
||||
)
|
||||
|
||||
if self._ip_address is None:
|
||||
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
|
||||
self._ip_address = ipaddress.ip_address(
|
||||
self.transport._producer.getPeer().host
|
||||
)
|
||||
|
||||
def dataReceived(self, bodyBytes):
|
||||
# This maybe called several times after cancel was called with buffered data.
|
||||
@ -524,29 +602,40 @@ class _ResponseReader(protocol.Protocol):
|
||||
)
|
||||
for handler, result in bytes_received_result:
|
||||
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
|
||||
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": self._request, "handler": handler.__qualname__})
|
||||
logger.debug(
|
||||
"Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": self._request, "handler": handler.__qualname__},
|
||||
)
|
||||
self.transport.stopProducing()
|
||||
self.transport.loseConnection()
|
||||
failure = result if result.value.fail else None
|
||||
self._finish_response(flags=["download_stopped"], failure=failure)
|
||||
|
||||
if self._maxsize and self._bytes_received > self._maxsize:
|
||||
logger.warning("Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s) in request %(request)s.",
|
||||
{'bytes': self._bytes_received,
|
||||
'maxsize': self._maxsize,
|
||||
'request': self._request})
|
||||
logger.warning(
|
||||
"Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s) in request %(request)s.",
|
||||
{
|
||||
"bytes": self._bytes_received,
|
||||
"maxsize": self._maxsize,
|
||||
"request": self._request,
|
||||
},
|
||||
)
|
||||
# Clear buffer earlier to avoid keeping data in memory for a long time.
|
||||
self._bodybuf.truncate(0)
|
||||
self._finished.cancel()
|
||||
|
||||
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
|
||||
if (
|
||||
self._warnsize
|
||||
and self._bytes_received > self._warnsize
|
||||
and not self._reached_warnsize
|
||||
):
|
||||
self._reached_warnsize = True
|
||||
logger.warning("Received more bytes than download "
|
||||
"warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'warnsize': self._warnsize,
|
||||
'request': self._request})
|
||||
logger.warning(
|
||||
"Received more bytes than download "
|
||||
"warn size (%(warnsize)s) in request %(request)s.",
|
||||
{"warnsize": self._warnsize, "request": self._request},
|
||||
)
|
||||
|
||||
def connectionLost(self, reason):
|
||||
if self._finished.called:
|
||||
@ -560,16 +649,20 @@ class _ResponseReader(protocol.Protocol):
|
||||
self._finish_response(flags=["partial"])
|
||||
return
|
||||
|
||||
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
||||
if reason.check(ResponseFailed) and any(
|
||||
r.check(_DataLoss) for r in reason.value.reasons
|
||||
):
|
||||
if not self._fail_on_dataloss:
|
||||
self._finish_response(flags=["dataloss"])
|
||||
return
|
||||
|
||||
if not self._fail_on_dataloss_warned:
|
||||
logger.warning("Got data loss in %s. If you want to process broken "
|
||||
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
|
||||
" -- This message won't be shown in further requests",
|
||||
self._txresponse.request.absoluteURI.decode())
|
||||
logger.warning(
|
||||
"Got data loss in %s. If you want to process broken "
|
||||
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
|
||||
" -- This message won't be shown in further requests",
|
||||
self._txresponse.request.absoluteURI.decode(),
|
||||
)
|
||||
self._fail_on_dataloss_warned = True
|
||||
|
||||
self._finished.errback(reason)
|
||||
|
@ -17,7 +17,9 @@ from scrapy.spiders import Spider
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
||||
H2DownloadHandlerOrSubclass = TypeVar("H2DownloadHandlerOrSubclass", bound="H2DownloadHandler")
|
||||
H2DownloadHandlerOrSubclass = TypeVar(
|
||||
"H2DownloadHandlerOrSubclass", bound="H2DownloadHandler"
|
||||
)
|
||||
|
||||
|
||||
class H2DownloadHandler:
|
||||
@ -25,11 +27,14 @@ class H2DownloadHandler:
|
||||
self._crawler = crawler
|
||||
|
||||
from twisted.internet import reactor
|
||||
|
||||
self._pool = H2ConnectionPool(reactor, settings)
|
||||
self._context_factory = load_context_factory_from_settings(settings, crawler)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls: Type[H2DownloadHandlerOrSubclass], crawler: Crawler) -> H2DownloadHandlerOrSubclass:
|
||||
def from_crawler(
|
||||
cls: Type[H2DownloadHandlerOrSubclass], crawler: Crawler
|
||||
) -> H2DownloadHandlerOrSubclass:
|
||||
return cls(crawler.settings, crawler)
|
||||
|
||||
def download_request(self, request: Request, spider: Spider) -> Deferred:
|
||||
@ -49,7 +54,8 @@ class ScrapyH2Agent:
|
||||
_ProxyAgent = ScrapyProxyH2Agent
|
||||
|
||||
def __init__(
|
||||
self, context_factory,
|
||||
self,
|
||||
context_factory,
|
||||
pool: H2ConnectionPool,
|
||||
connect_timeout: int = 10,
|
||||
bind_address: Optional[bytes] = None,
|
||||
@ -63,19 +69,22 @@ class ScrapyH2Agent:
|
||||
|
||||
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
|
||||
from twisted.internet import reactor
|
||||
bind_address = request.meta.get('bindaddress') or self._bind_address
|
||||
proxy = request.meta.get('proxy')
|
||||
|
||||
bind_address = request.meta.get("bindaddress") or self._bind_address
|
||||
proxy = request.meta.get("proxy")
|
||||
if proxy:
|
||||
_, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
|
||||
scheme = _parse(request.url)[0]
|
||||
|
||||
if scheme == b'https':
|
||||
if scheme == b"https":
|
||||
# ToDo
|
||||
raise NotImplementedError('Tunneling via CONNECT method using HTTP/2.0 is not yet supported')
|
||||
raise NotImplementedError(
|
||||
"Tunneling via CONNECT method using HTTP/2.0 is not yet supported"
|
||||
)
|
||||
return self._ProxyAgent(
|
||||
reactor=reactor,
|
||||
context_factory=self._context_factory,
|
||||
proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
|
||||
proxy_uri=URI.fromBytes(to_bytes(proxy, encoding="ascii")),
|
||||
connect_timeout=timeout,
|
||||
bind_address=bind_address,
|
||||
pool=self._pool,
|
||||
@ -91,7 +100,8 @@ class ScrapyH2Agent:
|
||||
|
||||
def download_request(self, request: Request, spider: Spider) -> Deferred:
|
||||
from twisted.internet import reactor
|
||||
timeout = request.meta.get('download_timeout') or self._connect_timeout
|
||||
|
||||
timeout = request.meta.get("download_timeout") or self._connect_timeout
|
||||
agent = self._get_agent(request, timeout)
|
||||
|
||||
start_time = time()
|
||||
@ -103,12 +113,16 @@ class ScrapyH2Agent:
|
||||
return d
|
||||
|
||||
@staticmethod
|
||||
def _cb_latency(response: Response, request: Request, start_time: float) -> Response:
|
||||
request.meta['download_latency'] = time() - start_time
|
||||
def _cb_latency(
|
||||
response: Response, request: Request, start_time: float
|
||||
) -> Response:
|
||||
request.meta["download_latency"] = time() - start_time
|
||||
return response
|
||||
|
||||
@staticmethod
|
||||
def _cb_timeout(response: Response, request: Request, timeout: float, timeout_cl: DelayedCall) -> Response:
|
||||
def _cb_timeout(
|
||||
response: Response, request: Request, timeout: float, timeout_cl: DelayedCall
|
||||
) -> Response:
|
||||
if timeout_cl.active():
|
||||
timeout_cl.cancel()
|
||||
return response
|
||||
|
@ -6,40 +6,49 @@ from scrapy.utils.misc import create_instance
class S3DownloadHandler:
def __init__(self, settings, *,
crawler=None,
aws_access_key_id=None, aws_secret_access_key=None,
aws_session_token=None,
httpdownloadhandler=HTTPDownloadHandler, **kw):
def __init__(
self,
settings,
*,
crawler=None,
aws_access_key_id=None,
aws_secret_access_key=None,
aws_session_token=None,
httpdownloadhandler=HTTPDownloadHandler,
**kw,
):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
raise NotConfigured("missing botocore library")
if not aws_access_key_id:
|
||||
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
|
||||
aws_access_key_id = settings["AWS_ACCESS_KEY_ID"]
|
||||
if not aws_secret_access_key:
|
||||
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
|
||||
aws_secret_access_key = settings["AWS_SECRET_ACCESS_KEY"]
|
||||
if not aws_session_token:
|
||||
aws_session_token = settings['AWS_SESSION_TOKEN']
|
||||
aws_session_token = settings["AWS_SESSION_TOKEN"]
|
||||
|
||||
# If no credentials could be found anywhere,
|
||||
# consider this an anonymous connection request by default;
|
||||
# unless 'anon' was set explicitly (True/False).
|
||||
anon = kw.get('anon')
|
||||
anon = kw.get("anon")
|
||||
if anon is None and not aws_access_key_id and not aws_secret_access_key:
|
||||
kw['anon'] = True
|
||||
self.anon = kw.get('anon')
|
||||
kw["anon"] = True
|
||||
self.anon = kw.get("anon")
|
||||
|
||||
self._signer = None
|
||||
import botocore.auth
|
||||
import botocore.credentials
|
||||
kw.pop('anon', None)
|
||||
|
||||
kw.pop("anon", None)
|
||||
if kw:
|
||||
raise TypeError(f'Unexpected keyword arguments: {kw}')
|
||||
raise TypeError(f"Unexpected keyword arguments: {kw}")
|
||||
if not self.anon:
|
||||
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
|
||||
self._signer = SignerCls(botocore.credentials.Credentials(
|
||||
aws_access_key_id, aws_secret_access_key, aws_session_token))
|
||||
SignerCls = botocore.auth.AUTH_TYPE_MAPS["s3"]
|
||||
self._signer = SignerCls(
|
||||
botocore.credentials.Credentials(
|
||||
aws_access_key_id, aws_secret_access_key, aws_session_token
|
||||
)
|
||||
)
|
||||
|
||||
_http_handler = create_instance(
|
||||
objcls=httpdownloadhandler,
|
||||
@ -54,20 +63,21 @@ class S3DownloadHandler:
|
||||
|
||||
def download_request(self, request, spider):
|
||||
p = urlparse_cached(request)
|
||||
scheme = 'https' if request.meta.get('is_secure') else 'http'
|
||||
scheme = "https" if request.meta.get("is_secure") else "http"
|
||||
bucket = p.hostname
|
||||
path = p.path + '?' + p.query if p.query else p.path
|
||||
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
|
||||
path = p.path + "?" + p.query if p.query else p.path
|
||||
url = f"{scheme}://{bucket}.s3.amazonaws.com{path}"
|
||||
if self.anon:
|
||||
request = request.replace(url=url)
|
||||
else:
|
||||
import botocore.awsrequest
|
||||
|
||||
awsrequest = botocore.awsrequest.AWSRequest(
|
||||
method=request.method,
|
||||
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
|
||||
url=f"{scheme}://s3.amazonaws.com/{bucket}{path}",
|
||||
headers=request.headers.to_unicode_dict(),
|
||||
data=request.body)
|
||||
data=request.body,
|
||||
)
|
||||
self._signer.add_auth(awsrequest)
|
||||
request = request.replace(
|
||||
url=url, headers=awsrequest.headers.items())
|
||||
request = request.replace(url=url, headers=awsrequest.headers.items())
|
||||
return self._download_http(request, spider)
|
||||
|
@ -18,28 +18,31 @@ from scrapy.utils.conf import build_component_list
|
||||
|
||||
class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'downloader middleware'
|
||||
component_name = "downloader middleware"
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(
|
||||
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
|
||||
return build_component_list(settings.getwithbase("DOWNLOADER_MIDDLEWARES"))
|
||||
|
||||
def _add_middleware(self, mw):
|
||||
if hasattr(mw, 'process_request'):
|
||||
self.methods['process_request'].append(mw.process_request)
|
||||
if hasattr(mw, 'process_response'):
|
||||
self.methods['process_response'].appendleft(mw.process_response)
|
||||
if hasattr(mw, 'process_exception'):
|
||||
self.methods['process_exception'].appendleft(mw.process_exception)
|
||||
if hasattr(mw, "process_request"):
|
||||
self.methods["process_request"].append(mw.process_request)
|
||||
if hasattr(mw, "process_response"):
|
||||
self.methods["process_response"].appendleft(mw.process_response)
|
||||
if hasattr(mw, "process_exception"):
|
||||
self.methods["process_exception"].appendleft(mw.process_exception)
|
||||
|
||||
def download(self, download_func: Callable, request: Request, spider: Spider):
|
||||
@defer.inlineCallbacks
|
||||
def process_request(request: Request):
|
||||
for method in self.methods['process_request']:
|
||||
for method in self.methods["process_request"]:
|
||||
method = cast(Callable, method)
|
||||
response = yield deferred_from_coro(method(request=request, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
response = yield deferred_from_coro(
|
||||
method(request=request, spider=spider)
|
||||
)
|
||||
if response is not None and not isinstance(
|
||||
response, (Response, Request)
|
||||
):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__qualname__} must return None, Response or "
|
||||
f"Request, got {response.__class__.__name__}"
|
||||
@ -55,9 +58,11 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
elif isinstance(response, Request):
|
||||
return response
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
for method in self.methods["process_response"]:
|
||||
method = cast(Callable, method)
|
||||
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
||||
response = yield deferred_from_coro(
|
||||
method(request=request, response=response, spider=spider)
|
||||
)
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__qualname__} must return Response or Request, "
|
||||
@ -70,10 +75,14 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
@defer.inlineCallbacks
|
||||
def process_exception(failure: Failure):
|
||||
exception = failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
for method in self.methods["process_exception"]:
|
||||
method = cast(Callable, method)
|
||||
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
response = yield deferred_from_coro(
|
||||
method(request=request, exception=exception, spider=spider)
|
||||
)
|
||||
if response is not None and not isinstance(
|
||||
response, (Response, Request)
|
||||
):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__qualname__} must return None, Response or "
|
||||
f"Request, got {type(response)}"
|
||||
|
@ -2,7 +2,11 @@ import logging
|
||||
|
||||
from OpenSSL import SSL
|
||||
from service_identity.exceptions import CertificateError
|
||||
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
|
||||
from twisted.internet._sslverify import (
|
||||
ClientTLSOptions,
|
||||
verifyHostname,
|
||||
VerificationError,
|
||||
)
|
||||
from twisted.internet.ssl import AcceptableCiphers
|
||||
|
||||
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
|
||||
@ -10,17 +14,17 @@ from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
|
||||
logger = logging.getLogger(__name__)
METHOD_TLS = 'TLS'
METHOD_TLSv10 = 'TLSv1.0'
METHOD_TLSv11 = 'TLSv1.1'
METHOD_TLSv12 = 'TLSv1.2'
METHOD_TLS = "TLS"
METHOD_TLSv10 = "TLSv1.0"
METHOD_TLSv11 = "TLSv1.1"
METHOD_TLSv12 = "TLSv1.2"
openssl_methods = {
METHOD_TLS: SSL.SSLv23_METHOD,  # protocol negotiation (recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD,  # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5),  # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6),  # TLS 1.2 only
METHOD_TLS: SSL.SSLv23_METHOD,  # protocol negotiation (recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD,  # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, "TLSv1_1_METHOD", 5),  # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, "TLSv1_2_METHOD", 6),  # TLS 1.2 only
}
|
||||
|
||||
@ -44,32 +48,38 @@ class ScrapyClientTLSOptions(ClientTLSOptions):
|
||||
connection.set_tlsext_host_name(self._hostnameBytes)
|
||||
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
|
||||
if self.verbose_logging:
|
||||
logger.debug('SSL connection to %s using protocol %s, cipher %s',
|
||||
self._hostnameASCII,
|
||||
connection.get_protocol_version_name(),
|
||||
connection.get_cipher_name(),
|
||||
)
|
||||
logger.debug(
|
||||
"SSL connection to %s using protocol %s, cipher %s",
|
||||
self._hostnameASCII,
|
||||
connection.get_protocol_version_name(),
|
||||
connection.get_cipher_name(),
|
||||
)
|
||||
server_cert = connection.get_peer_certificate()
|
||||
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
|
||||
x509name_to_string(server_cert.get_issuer()),
|
||||
x509name_to_string(server_cert.get_subject()),
|
||||
)
|
||||
logger.debug(
|
||||
'SSL connection certificate: issuer "%s", subject "%s"',
|
||||
x509name_to_string(server_cert.get_issuer()),
|
||||
x509name_to_string(server_cert.get_subject()),
|
||||
)
|
||||
key_info = get_temp_key_info(connection._ssl)
|
||||
if key_info:
|
||||
logger.debug('SSL temp key: %s', key_info)
|
||||
logger.debug("SSL temp key: %s", key_info)
|
||||
|
||||
try:
|
||||
verifyHostname(connection, self._hostnameASCII)
|
||||
except (CertificateError, VerificationError) as e:
|
||||
logger.warning(
|
||||
'Remote certificate is not valid for hostname "%s"; %s',
|
||||
self._hostnameASCII, e)
|
||||
self._hostnameASCII,
|
||||
e,
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning(
|
||||
'Ignoring error while verifying certificate '
|
||||
"Ignoring error while verifying certificate "
|
||||
'from host "%s" (exception: %r)',
|
||||
self._hostnameASCII, e)
|
||||
self._hostnameASCII,
|
||||
e,
|
||||
)
|
||||
|
||||
|
||||
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')
|
||||
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString("DEFAULT")
|
||||
|
@ -15,33 +15,33 @@ from scrapy.responsetypes import responsetypes
|
||||
def _parsed_url_args(parsed):
|
||||
# Assume parsed is urlparse-d from Request.url,
|
||||
# which was passed via safe_url_string and is ascii-only.
|
||||
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
||||
path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
|
||||
path = to_bytes(path, encoding="ascii")
|
||||
host = to_bytes(parsed.hostname, encoding="ascii")
|
||||
port = parsed.port
|
||||
scheme = to_bytes(parsed.scheme, encoding="ascii")
|
||||
netloc = to_bytes(parsed.netloc, encoding="ascii")
|
||||
if port is None:
|
||||
port = 443 if scheme == b'https' else 80
|
||||
port = 443 if scheme == b"https" else 80
|
||||
return scheme, netloc, host, port, path
|
||||
|
||||
|
||||
def _parse(url):
""" Return tuple of (scheme, netloc, host, port, path),
"""Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string
and is ascii-only.
"""
url = url.strip()
if not re.match(r'^\w+://', url):
url = '//' + url
if not re.match(r"^\w+://", url):
url = "//" + url
parsed = urlparse(url)
return _parsed_url_args(parsed)
|
||||
|
||||
class ScrapyHTTPPageGetter(HTTPClient):
|
||||
|
||||
delimiter = b'\n'
|
||||
delimiter = b"\n"
|
||||
|
||||
def connectionMade(self):
|
||||
self.headers = Headers() # bucket for response headers
|
||||
@ -75,8 +75,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
|
||||
self.factory.noPage(reason)
|
||||
|
||||
def handleResponse(self, response):
|
||||
if self.factory.method.upper() == b'HEAD':
|
||||
self.factory.page(b'')
|
||||
if self.factory.method.upper() == b"HEAD":
|
||||
self.factory.page(b"")
|
||||
elif self.length is not None and self.length > 0:
|
||||
self.factory.noPage(self._connection_lost_reason)
|
||||
else:
|
||||
@ -87,12 +87,15 @@ class ScrapyHTTPPageGetter(HTTPClient):
|
||||
self.transport.loseConnection()
|
||||
|
||||
# transport cleanup needed for HTTPS connections
|
||||
if self.factory.url.startswith(b'https'):
|
||||
if self.factory.url.startswith(b"https"):
|
||||
self.transport.stopProducing()
|
||||
|
||||
self.factory.noPage(
|
||||
defer.TimeoutError(f"Getting {self.factory.url} took longer "
|
||||
f"than {self.factory.timeout} seconds."))
|
||||
defer.TimeoutError(
|
||||
f"Getting {self.factory.url} took longer "
|
||||
f"than {self.factory.timeout} seconds."
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# This class used to inherit from Twisted’s
|
||||
@ -109,16 +112,24 @@ class ScrapyHTTPClientFactory(ClientFactory):
|
||||
afterFoundGet = False
|
||||
|
||||
def _build_response(self, body, request):
|
||||
request.meta['download_latency'] = self.headers_time - self.start_time
|
||||
request.meta["download_latency"] = self.headers_time - self.start_time
|
||||
status = int(self.status)
|
||||
headers = Headers(self.response_headers)
|
||||
respcls = responsetypes.from_args(headers=headers, url=self._url, body=body)
|
||||
return respcls(url=self._url, status=status, headers=headers, body=body, protocol=to_unicode(self.version))
|
||||
return respcls(
|
||||
url=self._url,
|
||||
status=status,
|
||||
headers=headers,
|
||||
body=body,
|
||||
protocol=to_unicode(self.version),
|
||||
)
|
||||
|
||||
def _set_connection_attributes(self, request):
|
||||
parsed = urlparse_cached(request)
|
||||
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
|
||||
proxy = request.meta.get('proxy')
|
||||
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
|
||||
parsed
|
||||
)
|
||||
proxy = request.meta.get("proxy")
|
||||
if proxy:
|
||||
self.scheme, _, self.host, self.port, _ = _parse(proxy)
|
||||
self.path = self.url
|
||||
@ -126,12 +137,12 @@ class ScrapyHTTPClientFactory(ClientFactory):
|
||||
def __init__(self, request, timeout=180):
|
||||
self._url = urldefrag(request.url)[0]
|
||||
# converting to bytes to comply to Twisted interface
|
||||
self.url = to_bytes(self._url, encoding='ascii')
|
||||
self.method = to_bytes(request.method, encoding='ascii')
|
||||
self.url = to_bytes(self._url, encoding="ascii")
|
||||
self.method = to_bytes(request.method, encoding="ascii")
|
||||
self.body = request.body or None
|
||||
self.headers = Headers(request.headers)
|
||||
self.response_headers = None
|
||||
self.timeout = request.meta.get('download_timeout') or timeout
|
||||
self.timeout = request.meta.get("download_timeout") or timeout
|
||||
self.start_time = time()
|
||||
self.deferred = defer.Deferred().addCallback(self._build_response, request)
|
||||
|
||||
@ -146,16 +157,16 @@ class ScrapyHTTPClientFactory(ClientFactory):
|
||||
self._set_connection_attributes(request)
|
||||
|
||||
# set Host header based on url
|
||||
self.headers.setdefault('Host', self.netloc)
|
||||
self.headers.setdefault("Host", self.netloc)
|
||||
|
||||
# set Content-Length based len of body
|
||||
if self.body is not None:
|
||||
self.headers['Content-Length'] = len(self.body)
|
||||
self.headers["Content-Length"] = len(self.body)
|
||||
# just in case a broken http/1.1 decides to keep connection alive
|
||||
self.headers.setdefault("Connection", "close")
|
||||
# Content-Length must be specified in POST method even with no body
|
||||
elif self.method == b'POST':
|
||||
self.headers['Content-Length'] = 0
|
||||
elif self.method == b"POST":
|
||||
self.headers["Content-Length"] = 0
|
||||
|
||||
def __repr__(self):
|
||||
return f"<{self.__class__.__name__}: {self.url}>"
|
||||
@ -171,6 +182,7 @@ class ScrapyHTTPClientFactory(ClientFactory):
|
||||
p.afterFoundGet = self.afterFoundGet
|
||||
if self.timeout:
|
||||
from twisted.internet import reactor
|
||||
|
||||
timeoutCall = reactor.callLater(self.timeout, p.timeout)
|
||||
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
|
||||
return p
|
||||
|
@ -79,13 +79,14 @@ class ExecutionEngine:
|
||||
self.running = False
|
||||
self.paused = False
|
||||
self.scheduler_cls = self._get_scheduler_class(crawler.settings)
|
||||
downloader_cls = load_object(self.settings['DOWNLOADER'])
|
||||
downloader_cls = load_object(self.settings["DOWNLOADER"])
|
||||
self.downloader = downloader_cls(crawler)
|
||||
self.scraper = Scraper(crawler)
|
||||
self._spider_closed_callback = spider_closed_callback
|
||||
|
||||
def _get_scheduler_class(self, settings: BaseSettings) -> type:
|
||||
from scrapy.core.scheduler import BaseScheduler
|
||||
|
||||
scheduler_cls = load_object(settings["SCHEDULER"])
|
||||
if not issubclass(scheduler_cls, BaseScheduler):
|
||||
raise TypeError(
|
||||
@ -106,6 +107,7 @@ class ExecutionEngine:
|
||||
|
||||
def stop(self) -> Deferred:
|
||||
"""Gracefully stop the execution engine"""
|
||||
|
||||
@inlineCallbacks
|
||||
def _finish_stopping_engine(_) -> Deferred:
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
|
||||
@ -115,7 +117,11 @@ class ExecutionEngine:
|
||||
raise RuntimeError("Engine not running")
|
||||
|
||||
self.running = False
|
||||
dfd = self.close_spider(self.spider, reason="shutdown") if self.spider is not None else succeed(None)
|
||||
dfd = (
|
||||
self.close_spider(self.spider, reason="shutdown")
|
||||
if self.spider is not None
|
||||
else succeed(None)
|
||||
)
|
||||
return dfd.addBoth(_finish_stopping_engine)
|
||||
|
||||
def close(self) -> Deferred:
|
||||
@ -126,7 +132,9 @@ class ExecutionEngine:
|
||||
if self.running:
|
||||
return self.stop() # will also close spider and downloader
|
||||
if self.spider is not None:
|
||||
return self.close_spider(self.spider, reason="shutdown") # will also close downloader
|
||||
return self.close_spider(
|
||||
self.spider, reason="shutdown"
|
||||
) # will also close downloader
|
||||
return succeed(self.downloader.close())
|
||||
|
||||
def pause(self) -> None:
|
||||
@ -144,7 +152,10 @@ class ExecutionEngine:
|
||||
if self.paused:
|
||||
return None
|
||||
|
||||
while not self._needs_backout() and self._next_request_from_scheduler() is not None:
|
||||
while (
|
||||
not self._needs_backout()
|
||||
and self._next_request_from_scheduler() is not None
|
||||
):
|
||||
pass
|
||||
|
||||
if self.slot.start_requests is not None and not self._needs_backout():
|
||||
@ -154,7 +165,11 @@ class ExecutionEngine:
|
||||
self.slot.start_requests = None
|
||||
except Exception:
|
||||
self.slot.start_requests = None
|
||||
logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': self.spider})
|
||||
logger.error(
|
||||
"Error while obtaining start requests",
|
||||
exc_info=True,
|
||||
extra={"spider": self.spider},
|
||||
)
|
||||
else:
|
||||
self.crawl(request)
|
||||
|
||||
@ -179,18 +194,30 @@ class ExecutionEngine:
|
||||
|
||||
d = self._download(request, self.spider)
|
||||
d.addBoth(self._handle_downloader_output, request)
|
||||
d.addErrback(lambda f: logger.info('Error while handling downloader output',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': self.spider}))
|
||||
d.addErrback(
|
||||
lambda f: logger.info(
|
||||
"Error while handling downloader output",
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={"spider": self.spider},
|
||||
)
|
||||
)
|
||||
d.addBoth(lambda _: self.slot.remove_request(request))
|
||||
d.addErrback(lambda f: logger.info('Error while removing request from slot',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': self.spider}))
|
||||
d.addErrback(
|
||||
lambda f: logger.info(
|
||||
"Error while removing request from slot",
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={"spider": self.spider},
|
||||
)
|
||||
)
|
||||
slot = self.slot
|
||||
d.addBoth(lambda _: slot.nextcall.schedule())
|
||||
d.addErrback(lambda f: logger.info('Error while scheduling new request',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': self.spider}))
|
||||
d.addErrback(
|
||||
lambda f: logger.info(
|
||||
"Error while scheduling new request",
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={"spider": self.spider},
|
||||
)
|
||||
)
|
||||
return d
|
||||
|
||||
def _handle_downloader_output(
|
||||
@ -199,7 +226,9 @@ class ExecutionEngine:
|
||||
assert self.spider is not None # typing
|
||||
|
||||
if not isinstance(result, (Request, Response, Failure)):
|
||||
raise TypeError(f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}")
|
||||
raise TypeError(
|
||||
f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
|
||||
)
|
||||
|
||||
# downloader middleware can return requests (for example, redirects)
|
||||
if isinstance(result, Request):
|
||||
@ -211,7 +240,7 @@ class ExecutionEngine:
|
||||
lambda f: logger.error(
|
||||
"Error while enqueuing downloader output",
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': self.spider},
|
||||
extra={"spider": self.spider},
|
||||
)
|
||||
)
|
||||
return d
|
||||
@ -244,16 +273,22 @@ class ExecutionEngine:
|
||||
stacklevel=2,
|
||||
)
|
||||
if spider is not self.spider:
|
||||
raise RuntimeError(f"The spider {spider.name!r} does not match the open spider")
|
||||
raise RuntimeError(
|
||||
f"The spider {spider.name!r} does not match the open spider"
|
||||
)
|
||||
if self.spider is None:
|
||||
raise RuntimeError(f"No open spider to crawl: {request}")
|
||||
self._schedule_request(request, self.spider)
|
||||
self.slot.nextcall.schedule() # type: ignore[union-attr]
|
||||
|
||||
def _schedule_request(self, request: Request, spider: Spider) -> None:
|
||||
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
|
||||
self.signals.send_catch_log(
|
||||
signals.request_scheduled, request=request, spider=spider
|
||||
)
|
||||
if not self.slot.scheduler.enqueue_request(request): # type: ignore[union-attr]
|
||||
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
|
||||
self.signals.send_catch_log(
|
||||
signals.request_dropped, request=request, spider=spider
|
||||
)
|
||||
|
||||
def download(self, request: Request, spider: Optional[Spider] = None) -> Deferred:
|
||||
"""Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
|
||||
@ -264,10 +299,14 @@ class ExecutionEngine:
|
||||
stacklevel=2,
|
||||
)
|
||||
if spider is not self.spider:
|
||||
logger.warning("The spider '%s' does not match the open spider", spider.name)
|
||||
logger.warning(
|
||||
"The spider '%s' does not match the open spider", spider.name
|
||||
)
|
||||
if self.spider is None:
|
||||
raise RuntimeError(f"No open spider to crawl: {request}")
|
||||
return self._download(request, spider).addBoth(self._downloaded, request, spider)
|
||||
return self._download(request, spider).addBoth(
|
||||
self._downloaded, request, spider
|
||||
)
|
||||
|
||||
def _downloaded(
|
||||
self, result: Union[Response, Request], request: Request, spider: Spider
|
||||
@ -286,7 +325,9 @@ class ExecutionEngine:
|
||||
|
||||
def _on_success(result: Union[Response, Request]) -> Union[Response, Request]:
|
||||
if not isinstance(result, (Response, Request)):
|
||||
raise TypeError(f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}")
|
||||
raise TypeError(
|
||||
f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
|
||||
)
|
||||
if isinstance(result, Response):
|
||||
if result.request is None:
|
||||
result.request = request
|
||||
@ -311,13 +352,19 @@ class ExecutionEngine:
|
||||
return dwld
|
||||
|
||||
@inlineCallbacks
|
||||
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
|
||||
def open_spider(
|
||||
self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True
|
||||
):
|
||||
if self.slot is not None:
|
||||
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
|
||||
logger.info("Spider opened", extra={'spider': spider})
|
||||
logger.info("Spider opened", extra={"spider": spider})
|
||||
nextcall = CallLaterOnce(self._next_request)
|
||||
scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
|
||||
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
|
||||
scheduler = create_instance(
|
||||
self.scheduler_cls, settings=None, crawler=self.crawler
|
||||
)
|
||||
start_requests = yield self.scraper.spidermw.process_start_requests(
|
||||
start_requests, spider
|
||||
)
|
||||
self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
|
||||
self.spider = spider
|
||||
if hasattr(scheduler, "open"):
|
||||
@ -337,7 +384,9 @@ class ExecutionEngine:
|
||||
"""
|
||||
assert self.spider is not None # typing
|
||||
expected_ex = (DontCloseSpider, CloseSpider)
|
||||
res = self.signals.send_catch_log(signals.spider_idle, spider=self.spider, dont_log=expected_ex)
|
||||
res = self.signals.send_catch_log(
|
||||
signals.spider_idle, spider=self.spider, dont_log=expected_ex
|
||||
)
|
||||
detected_ex = {
|
||||
ex: x.value
|
||||
for _, x in res
|
||||
@ -347,7 +396,7 @@ class ExecutionEngine:
|
||||
if DontCloseSpider in detected_ex:
|
||||
return None
|
||||
if self.spider_is_idle():
|
||||
ex = detected_ex.get(CloseSpider, CloseSpider(reason='finished'))
|
||||
ex = detected_ex.get(CloseSpider, CloseSpider(reason="finished"))
|
||||
assert isinstance(ex, CloseSpider) # typing
|
||||
self.close_spider(self.spider, reason=ex.reason)
|
||||
|
||||
@ -359,40 +408,55 @@ class ExecutionEngine:
if self.slot.closing is not None:
return self.slot.closing
logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider})
logger.info(
"Closing spider (%(reason)s)", {"reason": reason}, extra={"spider": spider}
)
dfd = self.slot.close()
def log_failure(msg: str) -> Callable:
def errback(failure: Failure) -> None:
logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider})
logger.error(
msg, exc_info=failure_to_exc_info(failure), extra={"spider": spider}
)
return errback
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addErrback(log_failure("Downloader close failure"))
|
||||
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
|
||||
dfd.addErrback(log_failure('Scraper close failure'))
|
||||
dfd.addErrback(log_failure("Scraper close failure"))
|
||||
|
||||
if hasattr(self.slot.scheduler, "close"):
|
||||
dfd.addBoth(lambda _: self.slot.scheduler.close(reason))
|
||||
dfd.addErrback(log_failure("Scheduler close failure"))
|
||||
|
||||
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
|
||||
signal=signals.spider_closed, spider=spider, reason=reason,
|
||||
))
|
||||
dfd.addErrback(log_failure('Error while sending spider_close signal'))
|
||||
dfd.addBoth(
|
||||
lambda _: self.signals.send_catch_log_deferred(
|
||||
signal=signals.spider_closed,
|
||||
spider=spider,
|
||||
reason=reason,
|
||||
)
|
||||
)
|
||||
dfd.addErrback(log_failure("Error while sending spider_close signal"))
|
||||
|
||||
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
|
||||
dfd.addErrback(log_failure('Stats close failure'))
|
||||
dfd.addErrback(log_failure("Stats close failure"))
|
||||
|
||||
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider}))
|
||||
dfd.addBoth(
|
||||
lambda _: logger.info(
|
||||
"Spider closed (%(reason)s)",
|
||||
{"reason": reason},
|
||||
extra={"spider": spider},
|
||||
)
|
||||
)
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'slot', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning slot'))
|
||||
dfd.addBoth(lambda _: setattr(self, "slot", None))
|
||||
dfd.addErrback(log_failure("Error while unassigning slot"))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'spider', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning spider'))
|
||||
dfd.addBoth(lambda _: setattr(self, "spider", None))
|
||||
dfd.addErrback(log_failure("Error while unassigning spider"))
|
||||
|
||||
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
|
||||
|
||||
@ -408,7 +472,11 @@ class ExecutionEngine:
|
||||
return [self.spider] if self.spider is not None else []
|
||||
|
||||
def has_capacity(self) -> bool:
|
||||
warnings.warn("ExecutionEngine.has_capacity is deprecated", ScrapyDeprecationWarning, stacklevel=2)
|
||||
warnings.warn(
|
||||
"ExecutionEngine.has_capacity is deprecated",
|
||||
ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return not bool(self.slot)
|
||||
|
||||
def schedule(self, request: Request, spider: Spider) -> None:
|
||||
|
@ -28,7 +28,9 @@ class H2ConnectionPool:
|
||||
# Save all requests that arrive before the connection is established
|
||||
self._pending_requests: Dict[Tuple, Deque[Deferred]] = {}
|
||||
|
||||
def get_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
|
||||
def get_connection(
|
||||
self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
|
||||
) -> Deferred:
|
||||
if key in self._pending_requests:
|
||||
# Received a request while connecting to remote
|
||||
# Create a deferred which will fire with the H2ClientProtocol
|
||||
@ -46,7 +48,9 @@ class H2ConnectionPool:
|
||||
# No connection is established for the given URI
|
||||
return self._new_connection(key, uri, endpoint)
|
||||
|
||||
def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
|
||||
def _new_connection(
|
||||
self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
|
||||
) -> Deferred:
|
||||
self._pending_requests[key] = deque()
|
||||
|
||||
conn_lost_deferred = Deferred()
|
||||
@ -102,7 +106,9 @@ class H2Agent:
|
||||
) -> None:
|
||||
self._reactor = reactor
|
||||
self._pool = pool
|
||||
self._context_factory = AcceptableProtocolsContextFactory(context_factory, acceptable_protocols=[b'h2'])
|
||||
self._context_factory = AcceptableProtocolsContextFactory(
|
||||
context_factory, acceptable_protocols=[b"h2"]
|
||||
)
|
||||
self.endpoint_factory = _StandardEndpointFactory(
|
||||
self._reactor, self._context_factory, connect_timeout, bind_address
|
||||
)
|
||||
@ -118,7 +124,7 @@ class H2Agent:
|
||||
return uri.scheme, uri.host, uri.port
|
||||
|
||||
def request(self, request: Request, spider: Spider) -> Deferred:
|
||||
uri = URI.fromBytes(bytes(request.url, encoding='utf-8'))
|
||||
uri = URI.fromBytes(bytes(request.url, encoding="utf-8"))
|
||||
try:
|
||||
endpoint = self.get_endpoint(uri)
|
||||
except SchemeNotSupported:
|
||||
|
@ -9,9 +9,15 @@ from h2.config import H2Configuration
|
||||
from h2.connection import H2Connection
|
||||
from h2.errors import ErrorCodes
|
||||
from h2.events import (
|
||||
Event, ConnectionTerminated, DataReceived, ResponseReceived,
|
||||
SettingsAcknowledged, StreamEnded, StreamReset, UnknownFrameReceived,
|
||||
WindowUpdated
|
||||
Event,
|
||||
ConnectionTerminated,
|
||||
DataReceived,
|
||||
ResponseReceived,
|
||||
SettingsAcknowledged,
|
||||
StreamEnded,
|
||||
StreamReset,
|
||||
UnknownFrameReceived,
|
||||
WindowUpdated,
|
||||
)
|
||||
from h2.exceptions import FrameTooLargeError, H2Error
|
||||
from twisted.internet.defer import Deferred
|
||||
@ -37,7 +43,6 @@ PROTOCOL_NAME = b"h2"
|
||||
|
||||
|
||||
class InvalidNegotiatedProtocol(H2Error):
|
||||
|
||||
def __init__(self, negotiated_protocol: bytes) -> None:
|
||||
self.negotiated_protocol = negotiated_protocol
|
||||
|
||||
@ -55,11 +60,13 @@ class RemoteTerminatedConnection(H2Error):
|
||||
self.terminate_event = event
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'Received GOAWAY frame from {self.remote_ip_address!r}'
|
||||
return f"Received GOAWAY frame from {self.remote_ip_address!r}"
|
||||
|
||||
|
||||
class MethodNotAllowed405(H2Error):
|
||||
def __init__(self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]) -> None:
|
||||
def __init__(
|
||||
self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]
|
||||
) -> None:
|
||||
self.remote_ip_address = remote_ip_address
|
||||
|
||||
def __str__(self) -> str:
|
||||
@ -70,7 +77,9 @@ class MethodNotAllowed405(H2Error):
|
||||
class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
IDLE_TIMEOUT = 240
|
||||
|
||||
def __init__(self, uri: URI, settings: Settings, conn_lost_deferred: Deferred) -> None:
|
||||
def __init__(
|
||||
self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
|
||||
) -> None:
|
||||
"""
|
||||
Arguments:
|
||||
uri -- URI of the base url to which HTTP/2 Connection will be made.
|
||||
@ -82,7 +91,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
"""
|
||||
self._conn_lost_deferred = conn_lost_deferred
|
||||
|
||||
config = H2Configuration(client_side=True, header_encoding='utf-8')
|
||||
config = H2Configuration(client_side=True, header_encoding="utf-8")
|
||||
self.conn = H2Connection(config=config)
|
||||
|
||||
# ID of the next request stream
|
||||
@ -105,31 +114,25 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
# initialized when connection is successfully made
|
||||
self.metadata: Dict = {
|
||||
# Peer certificate instance
|
||||
'certificate': None,
|
||||
|
||||
"certificate": None,
|
||||
# Address of the server we are connected to which
|
||||
# is updated when HTTP/2 connection is made successfully
|
||||
'ip_address': None,
|
||||
|
||||
"ip_address": None,
|
||||
# URI of the peer HTTP/2 connection is made
|
||||
'uri': uri,
|
||||
|
||||
"uri": uri,
|
||||
# Both ip_address and uri are used by the Stream before
|
||||
# initiating the request to verify that the base address
|
||||
|
||||
# Variables taken from Project Settings
|
||||
'default_download_maxsize': settings.getint('DOWNLOAD_MAXSIZE'),
|
||||
'default_download_warnsize': settings.getint('DOWNLOAD_WARNSIZE'),
|
||||
|
||||
"default_download_maxsize": settings.getint("DOWNLOAD_MAXSIZE"),
|
||||
"default_download_warnsize": settings.getint("DOWNLOAD_WARNSIZE"),
|
||||
# Counter to keep track of opened streams. This counter
|
||||
# is used to make sure that not more than MAX_CONCURRENT_STREAMS
|
||||
# streams are opened which leads to ProtocolError
|
||||
# We use simple FIFO policy to handle pending requests
|
||||
'active_streams': 0,
|
||||
|
||||
"active_streams": 0,
|
||||
# Flag to keep track if settings were acknowledged by the remote
|
||||
# This ensures that we have established a HTTP/2 connection
|
||||
'settings_acknowledged': False,
|
||||
"settings_acknowledged": False,
|
||||
}
|
||||
|
||||
@property
|
||||
@ -138,7 +141,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
This is used while initiating pending streams to make sure
|
||||
that we initiate stream only during active HTTP/2 Connection
|
||||
"""
|
||||
return bool(self.transport.connected) and self.metadata['settings_acknowledged']
|
||||
return bool(self.transport.connected) and self.metadata["settings_acknowledged"]
|
||||
|
||||
@property
|
||||
def allowed_max_concurrent_streams(self) -> int:
|
||||
@ -149,7 +152,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
"""
|
||||
return min(
|
||||
self.conn.local_settings.max_concurrent_streams,
|
||||
self.conn.remote_settings.max_concurrent_streams
|
||||
self.conn.remote_settings.max_concurrent_streams,
|
||||
)
|
||||
|
||||
def _send_pending_requests(self) -> None:
|
||||
@ -159,37 +162,39 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
"""
|
||||
while (
|
||||
self._pending_request_stream_pool
|
||||
and self.metadata['active_streams'] < self.allowed_max_concurrent_streams
|
||||
and self.metadata["active_streams"] < self.allowed_max_concurrent_streams
|
||||
and self.h2_connected
|
||||
):
|
||||
self.metadata['active_streams'] += 1
|
||||
self.metadata["active_streams"] += 1
|
||||
stream = self._pending_request_stream_pool.popleft()
|
||||
stream.initiate_request()
|
||||
self._write_to_transport()
|
||||
|
||||
def pop_stream(self, stream_id: int) -> Stream:
|
||||
"""Perform cleanup when a stream is closed
|
||||
"""
|
||||
"""Perform cleanup when a stream is closed"""
|
||||
stream = self.streams.pop(stream_id)
|
||||
self.metadata['active_streams'] -= 1
|
||||
self.metadata["active_streams"] -= 1
|
||||
self._send_pending_requests()
|
||||
return stream
|
||||
|
||||
def _new_stream(self, request: Request, spider: Spider) -> Stream:
|
||||
"""Instantiates a new Stream object
|
||||
"""
|
||||
"""Instantiates a new Stream object"""
|
||||
stream = Stream(
|
||||
stream_id=next(self._stream_id_generator),
|
||||
request=request,
|
||||
protocol=self,
|
||||
download_maxsize=getattr(spider, 'download_maxsize', self.metadata['default_download_maxsize']),
|
||||
download_warnsize=getattr(spider, 'download_warnsize', self.metadata['default_download_warnsize']),
|
||||
download_maxsize=getattr(
|
||||
spider, "download_maxsize", self.metadata["default_download_maxsize"]
|
||||
),
|
||||
download_warnsize=getattr(
|
||||
spider, "download_warnsize", self.metadata["default_download_warnsize"]
|
||||
),
|
||||
)
|
||||
self.streams[stream.stream_id] = stream
|
||||
return stream
|
||||
|
||||
def _write_to_transport(self) -> None:
|
||||
""" Write data to the underlying transport connection
|
||||
"""Write data to the underlying transport connection
|
||||
from the HTTP2 connection instance if any
|
||||
"""
|
||||
# Reset the idle timeout as connection is still actively sending data
|
||||
@ -200,7 +205,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
|
||||
def request(self, request: Request, spider: Spider) -> Deferred:
|
||||
if not isinstance(request, Request):
|
||||
raise TypeError(f'Expected scrapy.http.Request, received {request.__class__.__qualname__}')
|
||||
raise TypeError(
|
||||
f"Expected scrapy.http.Request, received {request.__class__.__qualname__}"
|
||||
)
|
||||
|
||||
stream = self._new_stream(request, spider)
|
||||
d = stream.get_response()
|
||||
@ -221,7 +228,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self.setTimeout(self.IDLE_TIMEOUT)
|
||||
|
||||
destination = self.transport.getPeer()
|
||||
self.metadata['ip_address'] = ipaddress.ip_address(destination.host)
|
||||
self.metadata["ip_address"] = ipaddress.ip_address(destination.host)
|
||||
|
||||
# Initiate H2 Connection
|
||||
self.conn.initiate_connection()
|
||||
@ -237,9 +244,14 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
"""
|
||||
Close the connection if it's not made via the expected protocol
|
||||
"""
|
||||
if self.transport.negotiatedProtocol is not None and self.transport.negotiatedProtocol != PROTOCOL_NAME:
|
||||
if (
|
||||
self.transport.negotiatedProtocol is not None
|
||||
and self.transport.negotiatedProtocol != PROTOCOL_NAME
|
||||
):
|
||||
# we have not initiated the connection yet, no need to send a GOAWAY frame to the remote peer
|
||||
self._lose_connection_with_error([InvalidNegotiatedProtocol(self.transport.negotiatedProtocol)])
|
||||
self._lose_connection_with_error(
|
||||
[InvalidNegotiatedProtocol(self.transport.negotiatedProtocol)]
|
||||
)
|
||||
|
||||
def _check_received_data(self, data: bytes) -> None:
|
||||
"""Checks for edge cases where the connection to remote fails
|
||||
@ -248,8 +260,8 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
Arguments:
|
||||
data -- Data received from the remote
|
||||
"""
|
||||
if data.startswith(b'HTTP/2.0 405 Method Not Allowed'):
|
||||
raise MethodNotAllowed405(self.metadata['ip_address'])
|
||||
if data.startswith(b"HTTP/2.0 405 Method Not Allowed"):
|
||||
raise MethodNotAllowed405(self.metadata["ip_address"])
|
||||
|
||||
def dataReceived(self, data: bytes) -> None:
|
||||
# Reset the idle timeout as connection is still actively receiving data
|
||||
@ -284,7 +296,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
if (
|
||||
self.conn.open_outbound_streams > 0
|
||||
or self.conn.open_inbound_streams > 0
|
||||
or self.metadata['active_streams'] > 0
|
||||
or self.metadata["active_streams"] > 0
|
||||
):
|
||||
error_code = ErrorCodes.PROTOCOL_ERROR
|
||||
else:
|
||||
@ -292,9 +304,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self.conn.close_connection(error_code=error_code)
|
||||
self._write_to_transport()
|
||||
|
||||
self._lose_connection_with_error([
|
||||
TimeoutError(f"Connection was IDLE for more than {self.IDLE_TIMEOUT}s")
|
||||
])
|
||||
self._lose_connection_with_error(
|
||||
[TimeoutError(f"Connection was IDLE for more than {self.IDLE_TIMEOUT}s")]
|
||||
)
|
||||
|
||||
def connectionLost(self, reason: Failure = connectionDone) -> None:
|
||||
"""Called by Twisted when the transport connection is lost.
|
||||
@ -311,13 +323,13 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
self._conn_lost_deferred.callback(self._conn_lost_errors)
|
||||
|
||||
for stream in self.streams.values():
|
||||
if stream.metadata['request_sent']:
|
||||
if stream.metadata["request_sent"]:
|
||||
close_reason = StreamCloseReason.CONNECTION_LOST
|
||||
else:
|
||||
close_reason = StreamCloseReason.INACTIVE
|
||||
stream.close(close_reason, self._conn_lost_errors, from_protocol=True)
|
||||
|
||||
self.metadata['active_streams'] -= len(self.streams)
|
||||
self.metadata["active_streams"] -= len(self.streams)
|
||||
self.streams.clear()
|
||||
self._pending_request_stream_pool.clear()
|
||||
self.conn.close_connection()
|
||||
@ -345,13 +357,13 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
elif isinstance(event, SettingsAcknowledged):
|
||||
self.settings_acknowledged(event)
|
||||
elif isinstance(event, UnknownFrameReceived):
|
||||
logger.warning('Unknown frame received: %s', event.frame)
|
||||
logger.warning("Unknown frame received: %s", event.frame)
|
||||
|
||||
# Event handler functions starts here
|
||||
def connection_terminated(self, event: ConnectionTerminated) -> None:
|
||||
self._lose_connection_with_error([
|
||||
RemoteTerminatedConnection(self.metadata['ip_address'], event)
|
||||
])
|
||||
self._lose_connection_with_error(
|
||||
[RemoteTerminatedConnection(self.metadata["ip_address"], event)]
|
||||
)
|
||||
|
||||
def data_received(self, event: DataReceived) -> None:
|
||||
try:
|
||||
@ -370,14 +382,14 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
stream.receive_headers(event.headers)
|
||||
|
||||
def settings_acknowledged(self, event: SettingsAcknowledged) -> None:
|
||||
self.metadata['settings_acknowledged'] = True
|
||||
self.metadata["settings_acknowledged"] = True
|
||||
|
||||
# Send off all the pending requests as now we have
|
||||
# established a proper HTTP/2 connection
|
||||
self._send_pending_requests()
|
||||
|
||||
# Update certificate when our HTTP/2 connection is established
|
||||
self.metadata['certificate'] = Certificate(self.transport.getPeerCertificate())
|
||||
self.metadata["certificate"] = Certificate(self.transport.getPeerCertificate())
|
||||
|
||||
def stream_ended(self, event: StreamEnded) -> None:
|
||||
try:
|
||||
@ -406,7 +418,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
|
||||
|
||||
@implementer(IProtocolNegotiationFactory)
|
||||
class H2ClientFactory(Factory):
|
||||
def __init__(self, uri: URI, settings: Settings, conn_lost_deferred: Deferred) -> None:
|
||||
def __init__(
|
||||
self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
|
||||
) -> None:
|
||||
self.uri = uri
|
||||
self.settings = settings
|
||||
self.conn_lost_deferred = conn_lost_deferred
|
||||
|
@ -32,18 +32,19 @@ class InactiveStreamClosed(ConnectionClosed):
|
||||
self.request = request
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'InactiveStreamClosed: Connection was closed without sending the request {self.request!r}'
|
||||
return f"InactiveStreamClosed: Connection was closed without sending the request {self.request!r}"
|
||||
|
||||
|
||||
class InvalidHostname(H2Error):
def __init__(self, request: Request, expected_hostname: str, expected_netloc: str) -> None:
def __init__(
self, request: Request, expected_hostname: str, expected_netloc: str
) -> None:
self.request = request
self.expected_hostname = expected_hostname
self.expected_netloc = expected_netloc
def __str__(self) -> str:
return f'InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}'
return f"InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}"
|
||||
|
||||
class StreamCloseReason(Enum):
|
||||
@ -100,28 +101,31 @@ class Stream:
|
||||
self._request: Request = request
|
||||
self._protocol: "H2ClientProtocol" = protocol
|
||||
|
||||
self._download_maxsize = self._request.meta.get('download_maxsize', download_maxsize)
|
||||
self._download_warnsize = self._request.meta.get('download_warnsize', download_warnsize)
|
||||
self._download_maxsize = self._request.meta.get(
|
||||
"download_maxsize", download_maxsize
|
||||
)
|
||||
self._download_warnsize = self._request.meta.get(
|
||||
"download_warnsize", download_warnsize
|
||||
)
|
||||
|
||||
# Metadata of an HTTP/2 connection stream
|
||||
# initialized when stream is instantiated
|
||||
self.metadata: Dict = {
|
||||
'request_content_length': 0 if self._request.body is None else len(self._request.body),
|
||||
|
||||
"request_content_length": 0
|
||||
if self._request.body is None
|
||||
else len(self._request.body),
|
||||
# Flag to keep track whether the stream has initiated the request
|
||||
'request_sent': False,
|
||||
|
||||
"request_sent": False,
|
||||
# Flag to track whether we have logged about exceeding download warnsize
|
||||
'reached_warnsize': False,
|
||||
|
||||
"reached_warnsize": False,
|
||||
# Each time we send a data frame, we will decrease value by the amount send.
|
||||
'remaining_content_length': 0 if self._request.body is None else len(self._request.body),
|
||||
|
||||
"remaining_content_length": 0
|
||||
if self._request.body is None
|
||||
else len(self._request.body),
|
||||
# Flag to keep track whether client (self) have closed this stream
|
||||
'stream_closed_local': False,
|
||||
|
||||
"stream_closed_local": False,
|
||||
# Flag to keep track whether the server has closed the stream
|
||||
'stream_closed_server': False,
|
||||
"stream_closed_server": False,
|
||||
}
|
||||
|
||||
# Private variable used to build the response
|
||||
@ -130,21 +134,19 @@ class Stream:
|
||||
self._response: Dict = {
|
||||
# Data received frame by frame from the server is appended
|
||||
# and passed to the response Deferred when completely received.
|
||||
'body': BytesIO(),
|
||||
|
||||
"body": BytesIO(),
|
||||
# The amount of data received that counts against the
|
||||
# flow control window
|
||||
'flow_controlled_size': 0,
|
||||
|
||||
"flow_controlled_size": 0,
|
||||
# Headers received after sending the request
|
||||
'headers': Headers({}),
|
||||
"headers": Headers({}),
|
||||
}
|
||||
|
||||
def _cancel(_) -> None:
|
||||
# Close this stream as gracefully as possible
|
||||
# If the associated request is initiated we reset this stream
|
||||
# else we directly call close() method
|
||||
if self.metadata['request_sent']:
|
||||
if self.metadata["request_sent"]:
|
||||
self.reset_stream(StreamCloseReason.CANCELLED)
|
||||
else:
|
||||
self.close(StreamCloseReason.CANCELLED)
|
||||
@ -152,7 +154,7 @@ class Stream:
|
||||
self._deferred_response = Deferred(_cancel)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'Stream(id={self.stream_id!r})'
|
||||
return f"Stream(id={self.stream_id!r})"
|
||||
|
||||
@property
|
||||
def _log_warnsize(self) -> bool:
|
||||
@ -163,14 +165,16 @@ class Stream:
|
||||
True if both the above conditions hold true
|
||||
False if any of the conditions is false
|
||||
"""
|
||||
content_length_header = int(self._response['headers'].get(b'Content-Length', -1))
|
||||
content_length_header = int(
|
||||
self._response["headers"].get(b"Content-Length", -1)
|
||||
)
|
||||
return (
|
||||
self._download_warnsize
|
||||
and (
|
||||
self._response['flow_controlled_size'] > self._download_warnsize
|
||||
self._response["flow_controlled_size"] > self._download_warnsize
|
||||
or content_length_header > self._download_warnsize
|
||||
)
|
||||
and not self.metadata['reached_warnsize']
|
||||
and not self.metadata["reached_warnsize"]
|
||||
)
|
||||
|
||||
def get_response(self) -> Deferred:
|
||||
@ -183,9 +187,10 @@ class Stream:
|
||||
# Make sure that we are sending the request to the correct URL
|
||||
url = urlparse(self._request.url)
|
||||
return (
|
||||
url.netloc == str(self._protocol.metadata['uri'].host, 'utf-8')
|
||||
or url.netloc == str(self._protocol.metadata['uri'].netloc, 'utf-8')
|
||||
or url.netloc == f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
|
||||
url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
|
||||
or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
|
||||
or url.netloc
|
||||
== f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
|
||||
)
|
||||
|
||||
def _get_request_headers(self) -> List[Tuple[str, str]]:
|
||||
@ -193,7 +198,7 @@ class Stream:
|
||||
|
||||
path = url.path
|
||||
if url.query:
|
||||
path += '?' + url.query
|
||||
path += "?" + url.query
|
||||
|
||||
# This pseudo-header field MUST NOT be empty for "http" or "https"
|
||||
# URIs; "http" or "https" URIs that do not contain a path component
|
||||
@ -202,40 +207,40 @@ class Stream:
|
||||
# a path component; these MUST include a ":path" pseudo-header field
|
||||
# with a value of '*' (refer RFC 7540 - Section 8.1.2.3)
|
||||
if not path:
|
||||
path = '*' if self._request.method == 'OPTIONS' else '/'
|
||||
path = "*" if self._request.method == "OPTIONS" else "/"
|
||||
|
||||
# Make sure pseudo-headers comes before all the other headers
|
||||
headers = [
|
||||
(':method', self._request.method),
|
||||
(':authority', url.netloc),
|
||||
(":method", self._request.method),
|
||||
(":authority", url.netloc),
|
||||
]
|
||||
|
||||
# The ":scheme" and ":path" pseudo-header fields MUST
|
||||
# be omitted for CONNECT method (refer RFC 7540 - Section 8.3)
|
||||
if self._request.method != 'CONNECT':
|
||||
if self._request.method != "CONNECT":
|
||||
headers += [
|
||||
(':scheme', self._protocol.metadata['uri'].scheme),
|
||||
(':path', path),
|
||||
(":scheme", self._protocol.metadata["uri"].scheme),
|
||||
(":path", path),
|
||||
]
|
||||
|
||||
content_length = str(len(self._request.body))
|
||||
headers.append(('Content-Length', content_length))
|
||||
headers.append(("Content-Length", content_length))
|
||||
|
||||
content_length_name = self._request.headers.normkey(b'Content-Length')
|
||||
content_length_name = self._request.headers.normkey(b"Content-Length")
|
||||
for name, values in self._request.headers.items():
|
||||
for value in values:
|
||||
value = str(value, 'utf-8')
|
||||
value = str(value, "utf-8")
|
||||
if name == content_length_name:
|
||||
if value != content_length:
|
||||
logger.warning(
|
||||
'Ignoring bad Content-Length header %r of request %r, '
|
||||
'sending %r instead',
|
||||
"Ignoring bad Content-Length header %r of request %r, "
|
||||
"sending %r instead",
|
||||
value,
|
||||
self._request,
|
||||
content_length,
|
||||
)
|
||||
continue
|
||||
headers.append((str(name, 'utf-8'), value))
|
||||
headers.append((str(name, "utf-8"), value))
|
||||
|
||||
return headers
|
||||
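The two hunks above only re-wrap long lines, but the header-building rules they reformat come straight from RFC 7540: pseudo-headers go first, an empty path becomes "/" (or "*" for OPTIONS), and ":scheme"/":path" are omitted for CONNECT. A standalone sketch of those rules, with a hypothetical helper name rather than the commit's code:

from urllib.parse import urlparse

def build_pseudo_headers(method: str, url: str) -> list:
    parts = urlparse(url)
    path = parts.path
    if parts.query:
        path += "?" + parts.query
    if not path:
        # RFC 7540, section 8.1.2.3: never send an empty :path
        path = "*" if method == "OPTIONS" else "/"
    headers = [(":method", method), (":authority", parts.netloc)]
    if method != "CONNECT":
        # RFC 7540, section 8.3: CONNECT carries no :scheme or :path
        headers += [(":scheme", parts.scheme), (":path", path)]
    return headers

# build_pseudo_headers("GET", "https://example.com/search?q=1")
# -> [(":method", "GET"), (":authority", "example.com"),
#     (":scheme", "https"), (":path", "/search?q=1")]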
@ -243,7 +248,7 @@ class Stream:
if self.check_request_url():
headers = self._get_request_headers()
self._protocol.conn.send_headers(self.stream_id, headers, end_stream=False)
self.metadata['request_sent'] = True
self.metadata["request_sent"] = True
self.send_data()
else:
# Close this stream calling the response errback

@ -252,44 +257,53 @@ class Stream:

def send_data(self) -> None:
"""Called immediately after the headers are sent. Here we send all the
data as part of the request.
data as part of the request.

If the content length is 0 initially then we end the stream immediately and
wait for response data.
If the content length is 0 initially then we end the stream immediately and
wait for response data.

Warning: Only call this method when stream not closed from client side
and has initiated request already by sending HEADER frame. If not then
stream will raise ProtocolError (raise by h2 state machine).
"""
if self.metadata['stream_closed_local']:
Warning: Only call this method when stream not closed from client side
and has initiated request already by sending HEADER frame. If not then
stream will raise ProtocolError (raise by h2 state machine).
"""
if self.metadata["stream_closed_local"]:
raise StreamClosedError(self.stream_id)

# Firstly, check what the flow control window is for current stream.
window_size = self._protocol.conn.local_flow_control_window(stream_id=self.stream_id)
window_size = self._protocol.conn.local_flow_control_window(
stream_id=self.stream_id
)

# Next, check what the maximum frame size is.
max_frame_size = self._protocol.conn.max_outbound_frame_size

# We will send no more than the window size or the remaining file size
# of data in this call, whichever is smaller.
bytes_to_send_size = min(window_size, self.metadata['remaining_content_length'])
bytes_to_send_size = min(window_size, self.metadata["remaining_content_length"])

# We now need to send a number of data frames.
while bytes_to_send_size > 0:
chunk_size = min(bytes_to_send_size, max_frame_size)

data_chunk_start_id = self.metadata['request_content_length'] - self.metadata['remaining_content_length']
data_chunk = self._request.body[data_chunk_start_id:data_chunk_start_id + chunk_size]
data_chunk_start_id = (
self.metadata["request_content_length"]
- self.metadata["remaining_content_length"]
)
data_chunk = self._request.body[
data_chunk_start_id : data_chunk_start_id + chunk_size
]

self._protocol.conn.send_data(self.stream_id, data_chunk, end_stream=False)

bytes_to_send_size -= chunk_size
self.metadata['remaining_content_length'] -= chunk_size
self.metadata["remaining_content_length"] -= chunk_size

self.metadata['remaining_content_length'] = max(0, self.metadata['remaining_content_length'])
self.metadata["remaining_content_length"] = max(
0, self.metadata["remaining_content_length"]
)

# End the stream if no more data needs to be send
if self.metadata['remaining_content_length'] == 0:
if self.metadata["remaining_content_length"] == 0:
self._protocol.conn.end_stream(self.stream_id)

# Q. What about the rest of the data?

@ -301,62 +315,64 @@ class Stream:
blocked behind the flow control.
"""
if (
self.metadata['remaining_content_length']
and not self.metadata['stream_closed_server']
and self.metadata['request_sent']
self.metadata["remaining_content_length"]
and not self.metadata["stream_closed_server"]
and self.metadata["request_sent"]
):
self.send_data()
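send_data() above is only re-wrapped by black, but the flow-control arithmetic it carries is worth spelling out: each pass sends at most min(local window, remaining body), split into frames no larger than the peer's maximum frame size, and whatever is left waits for a WINDOW_UPDATE. A small sketch of that budget calculation (hypothetical helper, not the commit's code):

def chunk_plan(remaining: int, window_size: int, max_frame_size: int) -> list:
    """Sizes of the DATA frames one send pass would emit."""
    sizes = []
    budget = min(window_size, remaining)    # never exceed the flow-control window
    while budget > 0:
        size = min(budget, max_frame_size)  # one frame at a time
        sizes.append(size)
        budget -= size
    return sizes

# chunk_plan(remaining=70_000, window_size=65_535, max_frame_size=16_384)
# -> [16384, 16384, 16384, 16383]; the remaining 4465 bytes wait for a WINDOW_UPDATE.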
def receive_data(self, data: bytes, flow_controlled_length: int) -> None:
self._response['body'].write(data)
self._response['flow_controlled_size'] += flow_controlled_length
self._response["body"].write(data)
self._response["flow_controlled_size"] += flow_controlled_length

# We check maxsize here in case the Content-Length header was not received
if self._download_maxsize and self._response['flow_controlled_size'] > self._download_maxsize:
if (
self._download_maxsize
and self._response["flow_controlled_size"] > self._download_maxsize
):
self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
return

if self._log_warnsize:
self.metadata['reached_warnsize'] = True
self.metadata["reached_warnsize"] = True
warning_msg = (
f'Received more ({self._response["flow_controlled_size"]}) bytes than download '
f'warn size ({self._download_warnsize}) in request {self._request}'
f"warn size ({self._download_warnsize}) in request {self._request}"
)
logger.warning(warning_msg)

# Acknowledge the data received
self._protocol.conn.acknowledge_received_data(
self._response['flow_controlled_size'],
self.stream_id
self._response["flow_controlled_size"], self.stream_id
)

def receive_headers(self, headers: List[HeaderTuple]) -> None:
for name, value in headers:
self._response['headers'][name] = value
self._response["headers"][name] = value

# Check if we exceed the allowed max data size which can be received
expected_size = int(self._response['headers'].get(b'Content-Length', -1))
expected_size = int(self._response["headers"].get(b"Content-Length", -1))
if self._download_maxsize and expected_size > self._download_maxsize:
self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
return

if self._log_warnsize:
self.metadata['reached_warnsize'] = True
self.metadata["reached_warnsize"] = True
warning_msg = (
f'Expected response size ({expected_size}) larger than '
f'download warn size ({self._download_warnsize}) in request {self._request}'
f"Expected response size ({expected_size}) larger than "
f"download warn size ({self._download_warnsize}) in request {self._request}"
)
logger.warning(warning_msg)

def reset_stream(self, reason: StreamCloseReason = StreamCloseReason.RESET) -> None:
"""Close this stream by sending a RST_FRAME to the remote peer"""
if self.metadata['stream_closed_local']:
if self.metadata["stream_closed_local"]:
raise StreamClosedError(self.stream_id)

# Clear buffer earlier to avoid keeping data in memory for a long time
self._response['body'].truncate(0)
self._response["body"].truncate(0)

self.metadata['stream_closed_local'] = True
self.metadata["stream_closed_local"] = True
self._protocol.conn.reset_stream(self.stream_id, ErrorCodes.REFUSED_STREAM)
self.close(reason)

@ -366,13 +382,14 @@ class Stream:
errors: Optional[List[BaseException]] = None,
from_protocol: bool = False,
) -> None:
"""Based on the reason sent we will handle each case.
"""
if self.metadata['stream_closed_server']:
"""Based on the reason sent we will handle each case."""
if self.metadata["stream_closed_server"]:
raise StreamClosedError(self.stream_id)

if not isinstance(reason, StreamCloseReason):
raise TypeError(f'Expected StreamCloseReason, received {reason.__class__.__qualname__}')
raise TypeError(
f"Expected StreamCloseReason, received {reason.__class__.__qualname__}"
)

# Have default value of errors as an empty list as
# some cases can add a list of exceptions

@ -381,7 +398,7 @@ class Stream:
if not from_protocol:
self._protocol.pop_stream(self.stream_id)

self.metadata['stream_closed_server'] = True
self.metadata["stream_closed_server"] = True

# We do not check for Content-Length or Transfer-Encoding in response headers
# and add `partial` flag as in HTTP/1.1 as 'A request or response that includes

@ -392,13 +409,14 @@ class Stream:
# receiving DATA_FRAME's when we have received the headers (not
# having Content-Length)
if reason is StreamCloseReason.MAXSIZE_EXCEEDED:
expected_size = int(self._response['headers'].get(
b'Content-Length',
self._response['flow_controlled_size'])
expected_size = int(
self._response["headers"].get(
b"Content-Length", self._response["flow_controlled_size"]
)
)
error_msg = (
f'Cancelling download of {self._request.url}: received response '
f'size ({expected_size}) larger than download max size ({self._download_maxsize})'
f"Cancelling download of {self._request.url}: received response "
f"size ({expected_size}) larger than download max size ({self._download_maxsize})"
)
logger.error(error_msg)
self._deferred_response.errback(CancelledError(error_msg))

@ -416,16 +434,20 @@ class Stream:

# There maybe no :status in headers, we make
# HTTP Status Code: 499 - Client Closed Request
self._response['headers'][':status'] = '499'
self._response["headers"][":status"] = "499"
self._fire_response_deferred()

elif reason is StreamCloseReason.RESET:
self._deferred_response.errback(ResponseFailed([
Failure(
f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
ProtocolError
self._deferred_response.errback(
ResponseFailed(
[
Failure(
f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
ProtocolError,
)
]
)
]))
)

elif reason is StreamCloseReason.CONNECTION_LOST:
self._deferred_response.errback(ResponseFailed(errors))

@ -436,33 +458,35 @@ class Stream:

else:
assert reason is StreamCloseReason.INVALID_HOSTNAME
self._deferred_response.errback(InvalidHostname(
self._request,
str(self._protocol.metadata['uri'].host, 'utf-8'),
f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
))
self._deferred_response.errback(
InvalidHostname(
self._request,
str(self._protocol.metadata["uri"].host, "utf-8"),
f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}',
)
)

def _fire_response_deferred(self) -> None:
"""Builds response from the self._response dict
and fires the response deferred callback with the
generated response instance"""

body = self._response['body'].getvalue()
body = self._response["body"].getvalue()
response_cls = responsetypes.from_args(
headers=self._response['headers'],
headers=self._response["headers"],
url=self._request.url,
body=body,
)

response = response_cls(
url=self._request.url,
status=int(self._response['headers'][':status']),
headers=self._response['headers'],
status=int(self._response["headers"][":status"]),
headers=self._response["headers"],
body=body,
request=self._request,
certificate=self._protocol.metadata['certificate'],
ip_address=self._protocol.metadata['ip_address'],
protocol='h2',
certificate=self._protocol.metadata["certificate"],
ip_address=self._protocol.metadata["ip_address"],
protocol="h2",
)

self._deferred_response.callback(response)
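The receive_data()/receive_headers() hunks above keep the same download-limit policy while re-wrapping it: abort the stream once a hard maximum is exceeded, and log a single warning when the softer warn size is crossed. A compact sketch of that policy, with hypothetical names (the commit itself only reformats it):

import logging

limit_logger = logging.getLogger("download_limits")

def check_download_limits(observed: int, maxsize: int, warnsize: int, warned: bool):
    """Return (abort, warned) for a maxsize/warnsize style check (sketch only)."""
    if maxsize and observed > maxsize:
        return True, warned       # hard limit: caller resets the stream
    if warnsize and observed > warnsize and not warned:
        limit_logger.warning("received %d bytes, over warn size %d", observed, warnsize)
        return False, True        # soft limit: warn once, keep downloading
    return False, warned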
@ -20,14 +20,18 @@ class BaseSchedulerMeta(type):
"""
Metaclass to check scheduler classes against the necessary interface
"""

def __instancecheck__(cls, instance):
return cls.__subclasscheck__(type(instance))

def __subclasscheck__(cls, subclass):
return (
hasattr(subclass, "has_pending_requests") and callable(subclass.has_pending_requests)
and hasattr(subclass, "enqueue_request") and callable(subclass.enqueue_request)
and hasattr(subclass, "next_request") and callable(subclass.next_request)
hasattr(subclass, "has_pending_requests")
and callable(subclass.has_pending_requests)
and hasattr(subclass, "enqueue_request")
and callable(subclass.enqueue_request)
and hasattr(subclass, "next_request")
and callable(subclass.next_request)
)


@ -162,6 +166,7 @@ class Scheduler(BaseScheduler):
:param crawler: The crawler object corresponding to the current crawl.
:type crawler: :class:`scrapy.crawler.Crawler`
"""

def __init__(
self,
dupefilter,

@ -187,15 +192,15 @@ class Scheduler(BaseScheduler):
"""
Factory method, initializes the scheduler with arguments taken from the crawl settings
"""
dupefilter_cls = load_object(crawler.settings['DUPEFILTER_CLASS'])
dupefilter_cls = load_object(crawler.settings["DUPEFILTER_CLASS"])
return cls(
dupefilter=create_instance(dupefilter_cls, crawler.settings, crawler),
jobdir=job_dir(crawler.settings),
dqclass=load_object(crawler.settings['SCHEDULER_DISK_QUEUE']),
mqclass=load_object(crawler.settings['SCHEDULER_MEMORY_QUEUE']),
logunser=crawler.settings.getbool('SCHEDULER_DEBUG'),
dqclass=load_object(crawler.settings["SCHEDULER_DISK_QUEUE"]),
mqclass=load_object(crawler.settings["SCHEDULER_MEMORY_QUEUE"]),
logunser=crawler.settings.getbool("SCHEDULER_DEBUG"),
stats=crawler.stats,
pqclass=load_object(crawler.settings['SCHEDULER_PRIORITY_QUEUE']),
pqclass=load_object(crawler.settings["SCHEDULER_PRIORITY_QUEUE"]),
crawler=crawler,
)

@ -239,11 +244,11 @@ class Scheduler(BaseScheduler):
return False
dqok = self._dqpush(request)
if dqok:
self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
self.stats.inc_value("scheduler/enqueued/disk", spider=self.spider)
else:
self._mqpush(request)
self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
self.stats.inc_value('scheduler/enqueued', spider=self.spider)
self.stats.inc_value("scheduler/enqueued/memory", spider=self.spider)
self.stats.inc_value("scheduler/enqueued", spider=self.spider)
return True

def next_request(self) -> Optional[Request]:

@ -257,13 +262,13 @@ class Scheduler(BaseScheduler):
"""
request = self.mqs.pop()
if request is not None:
self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
self.stats.inc_value("scheduler/dequeued/memory", spider=self.spider)
else:
request = self._dqpop()
if request is not None:
self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
self.stats.inc_value("scheduler/dequeued/disk", spider=self.spider)
if request is not None:
self.stats.inc_value('scheduler/dequeued', spider=self.spider)
self.stats.inc_value("scheduler/dequeued", spider=self.spider)
return request

def __len__(self) -> int:

@ -279,13 +284,19 @@ class Scheduler(BaseScheduler):
self.dqs.push(request)
except ValueError as e:  # non serializable request
if self.logunser:
msg = ("Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)")
logger.warning(msg, {'request': request, 'reason': e},
exc_info=True, extra={'spider': self.spider})
msg = (
"Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)"
)
logger.warning(
msg,
{"request": request, "reason": e},
exc_info=True,
extra={"spider": self.spider},
)
self.logunser = False
self.stats.inc_value('scheduler/unserializable', spider=self.spider)
self.stats.inc_value("scheduler/unserializable", spider=self.spider)
return False
else:
return True

@ -299,43 +310,50 @@ class Scheduler(BaseScheduler):
return None

def _mq(self):
""" Create a new priority queue instance, with in-memory storage """
return create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key='')
"""Create a new priority queue instance, with in-memory storage"""
return create_instance(
self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key="",
)

def _dq(self):
""" Create a new priority queue instance, with disk storage """
"""Create a new priority queue instance, with disk storage"""
state = self._read_dqs_state(self.dqdir)
q = create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state)
q = create_instance(
self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state,
)
if q:
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
{'queuesize': len(q)}, extra={'spider': self.spider})
logger.info(
"Resuming crawl (%(queuesize)d requests scheduled)",
{"queuesize": len(q)},
extra={"spider": self.spider},
)
return q

def _dqdir(self, jobdir: Optional[str]) -> Optional[str]:
""" Return a folder name to keep disk queue state at """
"""Return a folder name to keep disk queue state at"""
if jobdir is not None:
dqdir = Path(jobdir, 'requests.queue')
dqdir = Path(jobdir, "requests.queue")
if not dqdir.exists():
dqdir.mkdir(parents=True)
return str(dqdir)
return None

def _read_dqs_state(self, dqdir: str) -> list:
path = Path(dqdir, 'active.json')
path = Path(dqdir, "active.json")
if not path.exists():
return []
with path.open(encoding="utf-8") as f:
return json.load(f)

def _write_dqs_state(self, dqdir: str, state: list) -> None:
with Path(dqdir, 'active.json').open('w', encoding="utf-8") as f:
with Path(dqdir, "active.json").open("w", encoding="utf-8") as f:
json.dump(state, f)
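The scheduler hunks above preserve the queueing policy while reformatting it: requests are pushed to the disk queue when a job directory is configured (falling back to memory when the request cannot be serialized), and popped from memory first so that fresh requests are served before persisted ones. A toy sketch of that two-tier behaviour (not Scrapy's actual queue classes):

class TwoTierQueue:
    def __init__(self, use_disk: bool = False):
        self.mq = []                        # in-memory queue
        self.dq = [] if use_disk else None  # stand-in for the serialized disk queue

    def push(self, request) -> str:
        if self.dq is not None:
            try:
                self.dq.append(request)     # the real code may raise on unserializable requests
                return "disk"
            except ValueError:
                pass                        # fall back to memory, as _dqpush() does
        self.mq.append(request)
        return "memory"

    def pop(self):
        if self.mq:                         # memory first, then disk
            return self.mq.pop()
        if self.dq:
            return self.dq.pop(0)
        return None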
@ -63,7 +63,9 @@ class Slot:
self.itemproc_size: int = 0
self.closing: Optional[Deferred] = None

def add_response_request(self, result: Union[Response, Failure], request: Request) -> Deferred:
def add_response_request(
self, result: Union[Response, Failure], request: Request
) -> Deferred:
deferred = Deferred()
self.queue.append((result, request, deferred))
if isinstance(result, Response):

@ -77,7 +79,9 @@ class Slot:
self.active.add(request)
return response, request, deferred

def finish_response(self, result: Union[Response, Failure], request: Request) -> None:
def finish_response(
self, result: Union[Response, Failure], request: Request
) -> None:
self.active.remove(request)
if isinstance(result, Response):
self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)

@ -92,13 +96,12 @@ class Slot:


class Scraper:

def __init__(self, crawler: Crawler) -> None:
self.slot: Optional[Slot] = None
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
itemproc_cls = load_object(crawler.settings["ITEM_PROCESSOR"])
self.itemproc = itemproc_cls.from_crawler(crawler)
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS")
self.crawler = crawler
self.signals = crawler.signals
self.logformatter = crawler.logformatter

@ -106,7 +109,7 @@ class Scraper:
@inlineCallbacks
def open_spider(self, spider: Spider):
"""Open the given spider for scraping and allocate resources for it"""
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
self.slot = Slot(self.crawler.settings.getint("SCRAPER_SLOT_MAX_ACTIVE_SIZE"))
yield self.itemproc.open_spider(spider)

def close_spider(self, spider: Spider) -> Deferred:

@ -127,7 +130,9 @@ class Scraper:
if self.slot.closing and self.slot.is_idle():
self.slot.closing.callback(spider)

def enqueue_scrape(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def enqueue_scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
if self.slot is None:
raise RuntimeError("Scraper slot not assigned")
dfd = self.slot.add_response_request(result, request)

@ -140,10 +145,13 @@ class Scraper:

dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
lambda f: logger.error(
"Scraper bug processing %(request)s",
{"request": request},
exc_info=failure_to_exc_info(f),
extra={"spider": spider},
)
)
self._scrape_next(spider)
return dfd

@ -153,35 +161,49 @@ class Scraper:
response, request, deferred = self.slot.next_response_request_deferred()
self._scrape(response, request, spider).chainDeferred(deferred)

def _scrape(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def _scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
"""
Handle the downloaded response or failure through the spider callback/errback
"""
if not isinstance(result, (Response, Failure)):
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
dfd = self._scrape2(result, request, spider)  # returns spider's processed output
raise TypeError(
f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}"
)
dfd = self._scrape2(
result, request, spider
)  # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
dfd.addCallback(self.handle_spider_output, request, result, spider)
return dfd

def _scrape2(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def _scrape2(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
"""
Handle the different cases of request's result been a Response or a Failure
"""
if isinstance(result, Response):
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
return self.spidermw.scrape_response(
self.call_spider, result, request, spider
)
# else result is a Failure
dfd = self.call_spider(result, request, spider)
return dfd.addErrback(self._log_download_errors, result, request, spider)

def call_spider(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def call_spider(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
if isinstance(result, Response):
if getattr(result, "request", None) is None:
result.request = request
callback = result.request.callback or spider._parse
warn_on_generator_with_return_value(spider, callback)
dfd = defer_succeed(result)
dfd.addCallbacks(callback=callback, callbackKeywords=result.request.cb_kwargs)
dfd.addCallbacks(
callback=callback, callbackKeywords=result.request.cb_kwargs
)
else:  # result is a Failure
result.request = request
warn_on_generator_with_return_value(spider, request.errback)

@ -189,45 +211,69 @@ class Scraper:
dfd.addErrback(request.errback)
return dfd.addCallback(iterate_spider_output)

def handle_spider_error(self, _failure: Failure, request: Request, response: Response, spider: Spider) -> None:
def handle_spider_error(
self, _failure: Failure, request: Request, response: Response, spider: Spider
) -> None:
exc = _failure.value
if isinstance(exc, CloseSpider):
assert self.crawler.engine is not None  # typing
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
return
logkws = self.logformatter.spider_error(_failure, request, response, spider)
logger.log(
*logformatter_adapter(logkws),
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
extra={"spider": spider},
)
self.signals.send_catch_log(
signal=signals.spider_error,
failure=_failure, response=response,
spider=spider
failure=_failure,
response=response,
spider=spider,
)
self.crawler.stats.inc_value(
f"spider_exceptions/{_failure.value.__class__.__name__}",
spider=spider
f"spider_exceptions/{_failure.value.__class__.__name__}", spider=spider
)

def handle_spider_output(self, result: Union[Iterable, AsyncIterable], request: Request,
response: Response, spider: Spider) -> Deferred:
def handle_spider_output(
self,
result: Union[Iterable, AsyncIterable],
request: Request,
response: Response,
spider: Spider,
) -> Deferred:
if not result:
return defer_succeed(None)
it: Union[Generator, AsyncGenerator]
if isinstance(result, AsyncIterable):
it = aiter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel_async(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
it = aiter_errback(
result, self.handle_spider_error, request, response, spider
)
dfd = parallel_async(
it,
self.concurrent_items,
self._process_spidermw_output,
request,
response,
spider,
)
else:
it = iter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
it = iter_errback(
result, self.handle_spider_error, request, response, spider
)
dfd = parallel(
it,
self.concurrent_items,
self._process_spidermw_output,
request,
response,
spider,
)
return dfd

def _process_spidermw_output(self, output: Any, request: Request, response: Response,
spider: Spider) -> Optional[Deferred]:
def _process_spidermw_output(
self, output: Any, request: Request, response: Response, spider: Spider
) -> Optional[Deferred]:
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""

@ -245,14 +291,19 @@ class Scraper:
else:
typename = type(output).__name__
logger.error(
'Spider must return request, item, or None, got %(typename)r in %(request)s',
{'request': request, 'typename': typename},
extra={'spider': spider},
"Spider must return request, item, or None, got %(typename)r in %(request)s",
{"request": request, "typename": typename},
extra={"spider": spider},
)
return None

def _log_download_errors(self, spider_failure: Failure, download_failure: Failure, request: Request,
spider: Spider) -> Union[Failure, None]:
def _log_download_errors(
self,
spider_failure: Failure,
download_failure: Failure,
request: Request,
spider: Spider,
) -> Union[Failure, None]:
"""Log and silence errors that come from the engine (typically download
errors that got propagated thru here).

@ -262,29 +313,33 @@ class Scraper:
"""
if not download_failure.check(IgnoreRequest):
if download_failure.frames:
logkws = self.logformatter.download_error(download_failure, request, spider)
logkws = self.logformatter.download_error(
download_failure, request, spider
)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
extra={"spider": spider},
exc_info=failure_to_exc_info(download_failure),
)
else:
errmsg = download_failure.getErrorMessage()
if errmsg:
logkws = self.logformatter.download_error(
download_failure, request, spider, errmsg)
download_failure, request, spider, errmsg
)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
extra={"spider": spider},
)

if spider_failure is not download_failure:
return spider_failure
return None

def _itemproc_finished(self, output: Any, item: Any, response: Response, spider: Spider) -> None:
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
def _itemproc_finished(
self, output: Any, item: Any, response: Response, spider: Spider
) -> None:
"""ItemProcessor finished for the given ``item`` and returned ``output``"""
assert self.slot is not None  # typing
self.slot.itemproc_size -= 1
if isinstance(output, Failure):

@ -292,19 +347,30 @@ class Scraper:
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
signal=signals.item_dropped,
item=item,
response=response,
spider=spider,
exception=output.value,
)
logkws = self.logformatter.item_error(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
exc_info=failure_to_exc_info(output))
logger.log(
*logformatter_adapter(logkws),
extra={"spider": spider},
exc_info=failure_to_exc_info(output),
)
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
signal=signals.item_error,
item=item,
response=response,
spider=spider,
failure=output,
)
logkws = self.logformatter.scraped(output, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)
signal=signals.item_scraped, item=output, response=response, spider=spider
)
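handle_spider_output() above feeds the spider's results through iter_errback/aiter_errback so that an exception raised halfway through the iterable is routed to handle_spider_error instead of aborting the whole batch. A generic sketch of that wrapper pattern (plain Python, not the Scrapy helpers themselves):

def iter_with_errback(iterable, errback, *args):
    """Yield items; hand any mid-iteration exception to ``errback``."""
    it = iter(iterable)
    while True:
        try:
            yield next(it)
        except StopIteration:
            return
        except Exception as exc:
            errback(exc, *args)  # report and stop, instead of blowing up the caller
            return

def report(exc, request):
    print(f"error while processing {request}: {exc!r}")

# list(iter_with_errback(spider_results, report, "<some request>"))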
@ -6,7 +6,17 @@ See documentation in docs/topics/spider-middleware.rst
import logging
from inspect import isasyncgenfunction, iscoroutine
from itertools import islice
from typing import Any, AsyncGenerator, AsyncIterable, Callable, Generator, Iterable, Tuple, Union, cast
from typing import (
Any,
AsyncGenerator,
AsyncIterable,
Callable,
Generator,
Iterable,
Tuple,
Union,
cast,
)

from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure

@ -17,7 +27,12 @@ from scrapy.http import Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.asyncgen import as_async_generator, collect_asyncgen
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro, deferred_f_from_coro_f, maybe_deferred_to_future
from scrapy.utils.defer import (
mustbe_deferred,
deferred_from_coro,
deferred_f_from_coro_f,
maybe_deferred_to_future,
)
from scrapy.utils.python import MutableAsyncChain, MutableChain


@ -33,7 +48,7 @@ def _isiterable(o) -> bool:

class SpiderMiddlewareManager(MiddlewareManager):

component_name = 'spider middleware'
component_name = "spider middleware"

def __init__(self, *middlewares):
super().__init__(*middlewares)

@ -41,28 +56,35 @@ class SpiderMiddlewareManager(MiddlewareManager):

@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
return build_component_list(settings.getwithbase("SPIDER_MIDDLEWARES"))

def _add_middleware(self, mw):
super()._add_middleware(mw)
if hasattr(mw, 'process_spider_input'):
self.methods['process_spider_input'].append(mw.process_spider_input)
if hasattr(mw, 'process_start_requests'):
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
process_spider_output = self._get_async_method_pair(mw, 'process_spider_output')
self.methods['process_spider_output'].appendleft(process_spider_output)
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
if hasattr(mw, "process_spider_input"):
self.methods["process_spider_input"].append(mw.process_spider_input)
if hasattr(mw, "process_start_requests"):
self.methods["process_start_requests"].appendleft(mw.process_start_requests)
process_spider_output = self._get_async_method_pair(mw, "process_spider_output")
self.methods["process_spider_output"].appendleft(process_spider_output)
process_spider_exception = getattr(mw, "process_spider_exception", None)
self.methods["process_spider_exception"].appendleft(process_spider_exception)

def _process_spider_input(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Any:
for method in self.methods['process_spider_input']:
def _process_spider_input(
self,
scrape_func: ScrapeFunc,
response: Response,
request: Request,
spider: Spider,
) -> Any:
for method in self.methods["process_spider_input"]:
method = cast(Callable, method)
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"{method.__qualname__} must return None "
f"or raise an exception, got {type(result)}")
msg = (
f"{method.__qualname__} must return None "
f"or raise an exception, got {type(result)}"
)
raise _InvalidOutput(msg)
except _InvalidOutput:
raise

@ -70,17 +92,22 @@ class SpiderMiddlewareManager(MiddlewareManager):
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)

def _evaluate_iterable(self, response: Response, spider: Spider, iterable: Union[Iterable, AsyncIterable],
exception_processor_index: int, recover_to: Union[MutableChain, MutableAsyncChain]
) -> Union[Generator, AsyncGenerator]:
def _evaluate_iterable(
self,
response: Response,
spider: Spider,
iterable: Union[Iterable, AsyncIterable],
exception_processor_index: int,
recover_to: Union[MutableChain, MutableAsyncChain],
) -> Union[Generator, AsyncGenerator]:
def process_sync(iterable: Iterable):
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex),
exception_processor_index)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), exception_processor_index
)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)

@ -90,8 +117,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
async for r in iterable:
yield r
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex),
exception_processor_index)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), exception_processor_index
)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)

@ -100,13 +128,20 @@ class SpiderMiddlewareManager(MiddlewareManager):
return process_async(iterable)
return process_sync(iterable)

def _process_spider_exception(self, response: Response, spider: Spider, _failure: Failure,
start_index: int = 0) -> Union[Failure, MutableChain]:
def _process_spider_exception(
self,
response: Response,
spider: Spider,
_failure: Failure,
start_index: int = 0,
) -> Union[Failure, MutableChain]:
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
method_list = islice(
self.methods["process_spider_exception"], start_index, None
)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue

@ -115,7 +150,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
dfd: Deferred = self._process_spider_output(response, spider, result, method_index + 1)
dfd: Deferred = self._process_spider_output(
response, spider, result, method_index + 1
)
# _process_spider_output() returns a Deferred only because of downgrading so this can be
# simplified when downgrading is removed.
if dfd.called:

@ -128,8 +165,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
elif result is None:
continue
else:
msg = (f"{method.__qualname__} must return None "
f"or an iterable, got {type(result)}")
msg = (
f"{method.__qualname__} must return None "
f"or an iterable, got {type(result)}"
)
raise _InvalidOutput(msg)
return _failure

@ -137,9 +176,13 @@ class SpiderMiddlewareManager(MiddlewareManager):
# being available immediately which doesn't work when it's a wrapped coroutine.
# It also needs @inlineCallbacks only because of downgrading so it can be removed when downgrading is removed.
@inlineCallbacks
def _process_spider_output(self, response: Response, spider: Spider,
result: Union[Iterable, AsyncIterable], start_index: int = 0
) -> Deferred:
def _process_spider_output(
self,
response: Response,
spider: Spider,
result: Union[Iterable, AsyncIterable],
start_index: int = 0,
) -> Deferred:
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered: Union[MutableChain, MutableAsyncChain]

@ -156,7 +199,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
# Storing methods and method tuples in the same list is weird but we should be able to roll this back
# when we drop this compatibility feature.

method_list = islice(self.methods['process_spider_output'], start_index, None)
method_list = islice(self.methods["process_spider_output"], start_index, None)
for method_index, method_pair in enumerate(method_list, start=start_index):
if method_pair is None:
continue

@ -177,24 +220,32 @@ class SpiderMiddlewareManager(MiddlewareManager):
result = as_async_generator(result)
elif need_downgrade:
if not self.downgrade_warning_done:
logger.warning(f"Async iterable passed to {method.__qualname__} "
f"was downgraded to a non-async one")
logger.warning(
f"Async iterable passed to {method.__qualname__} "
f"was downgraded to a non-async one"
)
self.downgrade_warning_done = True
assert isinstance(result, AsyncIterable)
# AsyncIterable -> Iterable
result = yield deferred_from_coro(collect_asyncgen(result))
if isinstance(recovered, AsyncIterable):
recovered_collected = yield deferred_from_coro(collect_asyncgen(recovered))
recovered_collected = yield deferred_from_coro(
collect_asyncgen(recovered)
)
recovered = MutableChain(recovered_collected)
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex), method_index + 1)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), method_index + 1
)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
result = self._evaluate_iterable(
response, spider, result, method_index + 1, recovered
)
else:
if iscoroutine(result):
result.close()  # Silence warning about not awaiting

@ -214,15 +265,18 @@ class SpiderMiddlewareManager(MiddlewareManager):
return MutableAsyncChain(result, recovered)
return MutableChain(result, recovered)  # type: ignore[arg-type]

async def _process_callback_output(self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
async def _process_callback_output(
self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
recovered: Union[MutableChain, MutableAsyncChain]
if isinstance(result, AsyncIterable):
recovered = MutableAsyncChain()
else:
recovered = MutableChain()
result = self._evaluate_iterable(response, spider, result, 0, recovered)
result = await maybe_deferred_to_future(self._process_spider_output(response, spider, result))
result = await maybe_deferred_to_future(
self._process_spider_output(response, spider, result)
)
if isinstance(result, AsyncIterable):
return MutableAsyncChain(result, recovered)
if isinstance(recovered, AsyncIterable):

@ -230,41 +284,60 @@ class SpiderMiddlewareManager(MiddlewareManager):
recovered = MutableChain(recovered_collected)
return MutableChain(result, recovered)  # type: ignore[arg-type]

def scrape_response(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Deferred:
async def process_callback_output(result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
def scrape_response(
self,
scrape_func: ScrapeFunc,
response: Response,
request: Request,
spider: Spider,
) -> Deferred:
async def process_callback_output(
result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
return await self._process_callback_output(response, spider, result)

def process_spider_exception(_failure: Failure) -> Union[Failure, MutableChain]:
return self._process_spider_exception(response, spider, _failure)

dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
dfd.addCallbacks(callback=deferred_f_from_coro_f(process_callback_output), errback=process_spider_exception)
dfd = mustbe_deferred(
self._process_spider_input, scrape_func, response, request, spider
)
dfd.addCallbacks(
callback=deferred_f_from_coro_f(process_callback_output),
errback=process_spider_exception,
)
return dfd

def process_start_requests(self, start_requests, spider: Spider) -> Deferred:
return self._process_chain('process_start_requests', start_requests, spider)
return self._process_chain("process_start_requests", start_requests, spider)

# This method is only needed until _async compatibility methods are removed.
@staticmethod
def _get_async_method_pair(mw: Any, methodname: str) -> Union[None, Callable, Tuple[Callable, Callable]]:
def _get_async_method_pair(
mw: Any, methodname: str
) -> Union[None, Callable, Tuple[Callable, Callable]]:
normal_method = getattr(mw, methodname, None)
methodname_async = methodname + "_async"
async_method = getattr(mw, methodname_async, None)
if not async_method:
return normal_method
if not normal_method:
logger.error(f"Middleware {mw.__qualname__} has {methodname_async} "
f"without {methodname}, skipping this method.")
logger.error(
f"Middleware {mw.__qualname__} has {methodname_async} "
f"without {methodname}, skipping this method."
)
return None
if not isasyncgenfunction(async_method):
logger.error(f"{async_method.__qualname__} is not "
f"an async generator function, skipping this method.")
logger.error(
f"{async_method.__qualname__} is not "
f"an async generator function, skipping this method."
)
return normal_method
if isasyncgenfunction(normal_method):
logger.error(f"{normal_method.__qualname__} is an async "
f"generator function while {methodname_async} exists, "
f"skipping both methods.")
logger.error(
f"{normal_method.__qualname__} is an async "
f"generator function while {methodname_async} exists, "
f"skipping both methods."
)
return None
return normal_method, async_method
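The downgrade path above (collect_asyncgen plus MutableChain) exists because an old-style middleware only understands plain iterables, so an async iterable has to be drained into a list before it is handed over. A self-contained sketch of that draining step, using generic names rather than Scrapy's helpers:

import asyncio

async def drain(ait):
    """Collect an async iterable into a list, mirroring the downgrade step above."""
    return [item async for item in ait]

async def demo():
    async def numbers():
        for i in range(3):
            yield i
    print(await drain(numbers()))  # [0, 1, 2]

asyncio.run(demo())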
@ -49,10 +49,9 @@ logger = logging.getLogger(__name__)


class Crawler:

def __init__(self, spidercls, settings=None, init_reactor: bool = False):
if isinstance(spidercls, Spider):
raise ValueError('The spidercls argument must be a class, not an object')
raise ValueError("The spidercls argument must be a class, not an object")

if isinstance(settings, dict) or settings is None:
settings = Settings(settings)

@ -63,14 +62,15 @@ class Crawler:

self.signals = SignalManager(self)

self.stats = load_object(self.settings['STATS_CLASS'])(self)
self.stats = load_object(self.settings["STATS_CLASS"])(self)

handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL"))
logging.root.addHandler(handler)

d = dict(overridden_settings(self.settings))
logger.info("Overridden settings:\n%(settings)s",
{'settings': pprint.pformat(d)})
logger.info(
"Overridden settings:\n%(settings)s", {"settings": pprint.pformat(d)}
)

if get_scrapy_root_handler() is not None:
# scrapy root handler already installed: update it with new settings

@ -80,11 +80,11 @@ class Crawler:
self.__remove_handler = lambda: logging.root.removeHandler(handler)
self.signals.connect(self.__remove_handler, signals.engine_stopped)

lf_cls = load_object(self.settings['LOG_FORMATTER'])
lf_cls = load_object(self.settings["LOG_FORMATTER"])
self.logformatter = lf_cls.from_crawler(self)

self.request_fingerprinter: RequestFingerprinter = create_instance(
load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
load_object(self.settings["REQUEST_FINGERPRINTER_CLASS"]),
settings=self.settings,
crawler=self,
)

@ -160,23 +160,26 @@ class CrawlerRunner:
crawlers = property(
lambda self: self._crawlers,
doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
":meth:`crawl` and managed by this class."
":meth:`crawl` and managed by this class.",
)

@staticmethod
def _get_spider_loader(settings):
""" Get SpiderLoader instance from settings """
cls_path = settings.get('SPIDER_LOADER_CLASS')
"""Get SpiderLoader instance from settings"""
cls_path = settings.get("SPIDER_LOADER_CLASS")
loader_cls = load_object(cls_path)
excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
excs = (
(DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
)
try:
verifyClass(ISpiderLoader, loader_cls)
except excs:
warnings.warn(
'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
'not fully implement scrapy.interfaces.ISpiderLoader interface. '
'Please add all missing methods to avoid unexpected runtime errors.',
category=ScrapyDeprecationWarning, stacklevel=2
"SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does "
"not fully implement scrapy.interfaces.ISpiderLoader interface. "
"Please add all missing methods to avoid unexpected runtime errors.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return loader_cls.from_settings(settings.frozencopy())

@ -191,9 +194,12 @@ class CrawlerRunner:

@property
def spiders(self):
warnings.warn("CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning, stacklevel=2)
warnings.warn(
"CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return self.spider_loader

def crawl(self, crawler_or_spidercls, *args, **kwargs):

@ -220,8 +226,9 @@ class CrawlerRunner:
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
"The crawler_or_spidercls argument cannot be a spider object, "
"it must be a spider class (or a Crawler object)"
)
crawler = self.create_crawler(crawler_or_spidercls)
return self._crawl(crawler, *args, **kwargs)

@ -233,7 +240,7 @@ class CrawlerRunner:
def _done(result):
self.crawlers.discard(crawler)
self._active.discard(d)
self.bootstrap_failed |= not getattr(crawler, 'spider', None)
self.bootstrap_failed |= not getattr(crawler, "spider", None)
return result

return d.addBoth(_done)

@ -251,8 +258,9 @@ class CrawlerRunner:
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
"The crawler_or_spidercls argument cannot be a spider object, "
"it must be a spider class (or a Crawler object)"
)
if isinstance(crawler_or_spidercls, Crawler):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)

@ -314,18 +322,23 @@ class CrawlerProcess(CrawlerRunner):

def _signal_shutdown(self, signum, _):
from twisted.internet import reactor

install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
logger.info(
"Received %(signame)s, shutting down gracefully. Send again to force ",
{"signame": signame},
)
reactor.callFromThread(self._graceful_stop_reactor)

def _signal_kill(self, signum, _):
from twisted.internet import reactor

install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
logger.info(
"Received %(signame)s twice, forcing unclean shutdown", {"signame": signame}
)
reactor.callFromThread(self._stop_reactor)
|
||||
|
||||
def _create_crawler(self, spidercls):
|
||||
@ -351,6 +364,7 @@ class CrawlerProcess(CrawlerRunner):
|
||||
handlers (default: True)
|
||||
"""
|
||||
from twisted.internet import reactor
|
||||
|
||||
if stop_after_crawl:
|
||||
d = self.join()
|
||||
# Don't start the reactor if the deferreds are already fired
|
||||
@ -364,8 +378,8 @@ class CrawlerProcess(CrawlerRunner):
|
||||
resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
|
||||
resolver.install_on_reactor()
|
||||
tp = reactor.getThreadPool()
|
||||
tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
|
||||
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
|
||||
tp.adjustPoolsize(maxthreads=self.settings.getint("REACTOR_THREADPOOL_MAXSIZE"))
|
||||
reactor.addSystemEventTrigger("before", "shutdown", self.stop)
|
||||
reactor.run(installSignalHandlers=False) # blocking call
|
||||
|
||||
def _graceful_stop_reactor(self):
|
||||
@ -375,6 +389,7 @@ class CrawlerProcess(CrawlerRunner):
|
||||
|
||||
def _stop_reactor(self, _=None):
|
||||
from twisted.internet import reactor
|
||||
|
||||
try:
|
||||
reactor.stop()
|
||||
except RuntimeError: # raised if already stopped or in shutdown stage
|
||||
|
@ -17,14 +17,14 @@ class AjaxCrawlMiddleware:
"""

def __init__(self, settings):
if not settings.getbool('AJAXCRAWL_ENABLED'):
if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured

# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
# We use something in between (32K) by default.
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
self.lookup_bytes = settings.getint("AJAXCRAWL_MAXSIZE", 32768)

@classmethod
def from_crawler(cls, crawler):
@ -35,23 +35,25 @@ class AjaxCrawlMiddleware:
if not isinstance(response, HtmlResponse) or response.status != 200:
return response

if request.method != 'GET':
if request.method != "GET":
# other HTTP methods are either not safe or don't have a body
return response

if 'ajax_crawlable' in request.meta: # prevent loops
if "ajax_crawlable" in request.meta: # prevent loops
return response

if not self._has_ajax_crawlable_variant(response):
return response

# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url + '#!')
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
extra={'spider': spider})
ajax_crawl_request = request.replace(url=request.url + "#!")
logger.debug(
"Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{"ajax_crawl_request": ajax_crawl_request, "request": request},
extra={"spider": spider},
)

ajax_crawl_request.meta['ajax_crawlable'] = True
ajax_crawl_request.meta["ajax_crawlable"] = True
return ajax_crawl_request

def _has_ajax_crawlable_variant(self, response):
@ -59,12 +61,14 @@ class AjaxCrawlMiddleware:
Return True if a page without hash fragment could be "AJAX crawlable"
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
body = response.text[:self.lookup_bytes]
body = response.text[: self.lookup_bytes]
return _has_ajaxcrawlable_meta(body)


# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')
_ajax_crawlable_re = re.compile(
    r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
)


def _has_ajaxcrawlable_meta(text):
@ -82,12 +86,12 @@ def _has_ajaxcrawlable_meta(text):
# Stripping scripts and comments is slow (about 20x slower than
# just checking if a string is in text); this is a quick fail-fast
# path that should work for most pages.
if 'fragment' not in text:
if "fragment" not in text:
return False
if 'content' not in text:
if "content" not in text:
return False

text = html.remove_tags_with_content(text, ('script', 'noscript'))
text = html.remove_tags_with_content(text, ("script", "noscript"))
text = html.replace_entities(text)
text = html.remove_comments(text)
return _ajax_crawlable_re.search(text) is not None

@ -29,14 +29,14 @@ class CookiesMiddleware:

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
if not crawler.settings.getbool("COOKIES_ENABLED"):
raise NotConfigured
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
return cls(crawler.settings.getbool("COOKIES_DEBUG"))

def _process_cookies(self, cookies, *, jar, request):
for cookie in cookies:
cookie_domain = cookie.domain
if cookie_domain.startswith('.'):
if cookie_domain.startswith("."):
cookie_domain = cookie_domain[1:]

request_domain = urlparse_cached(request).hostname.lower()
@ -49,7 +49,7 @@ class CookiesMiddleware:
jar.set_cookie_if_ok(cookie, request)

def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
if request.meta.get("dont_merge_cookies", False):
return

cookiejarkey = request.meta.get("cookiejar")
@ -58,12 +58,12 @@ class CookiesMiddleware:
self._process_cookies(cookies, jar=jar, request=request)

# set Cookie header
request.headers.pop('Cookie', None)
request.headers.pop("Cookie", None)
jar.add_cookie_header(request)
self._debug_cookie(request, spider)

def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
if request.meta.get("dont_merge_cookies", False):
return response

# extract cookies from Set-Cookie and drop invalid/expired cookies
@ -78,21 +78,25 @@ class CookiesMiddleware:

def _debug_cookie(self, request, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in request.headers.getlist('Cookie')]
cl = [
to_unicode(c, errors="replace")
for c in request.headers.getlist("Cookie")
]
if cl:
cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
msg = f"Sending cookies to: {request}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
logger.debug(msg, extra={"spider": spider})

def _debug_set_cookie(self, response, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in response.headers.getlist('Set-Cookie')]
cl = [
to_unicode(c, errors="replace")
for c in response.headers.getlist("Set-Cookie")
]
if cl:
cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
msg = f"Received cookies from: {response}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
logger.debug(msg, extra={"spider": spider})

def _format_cookie(self, cookie, request):
"""
@ -113,8 +117,11 @@ class CookiesMiddleware:
try:
decoded[key] = cookie[key].decode("utf8")
except UnicodeDecodeError:
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
request, cookie)
logger.warning(
"Non UTF-8 encoded cookie found in request %s: %s",
request,
cookie,
)
decoded[key] = cookie[key].decode("latin1", errors="replace")

cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"

@ -16,7 +16,7 @@ from scrapy.responsetypes import responsetypes


warn(
'scrapy.downloadermiddlewares.decompression is deprecated',
"scrapy.downloadermiddlewares.decompression is deprecated",
ScrapyDeprecationWarning,
stacklevel=2,
)
@ -26,15 +26,15 @@ logger = logging.getLogger(__name__)


class DecompressionMiddleware:
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """
"""This middleware tries to recognise and extract the possibly compressed
responses that may arrive."""

def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
"tar": self._is_tar,
"zip": self._is_zip,
"gz": self._is_gzip,
"bz2": self._is_bzip2,
}

def _is_tar(self, response):
@ -86,7 +86,10 @@ class DecompressionMiddleware:
for fmt, func in self._formats.items():
new_response = func(response)
if new_response:
logger.debug('Decompressed response with format: %(responsefmt)s',
{'responsefmt': fmt}, extra={'spider': spider})
logger.debug(
"Decompressed response with format: %(responsefmt)s",
{"responsefmt": fmt},
extra={"spider": spider},
)
return new_response
return response

@ -8,13 +8,12 @@ from scrapy.utils.python import without_none_values


class DefaultHeadersMiddleware:

def __init__(self, headers):
self._headers = headers

@classmethod
def from_crawler(cls, crawler):
headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
headers = without_none_values(crawler.settings["DEFAULT_REQUEST_HEADERS"])
return cls(headers.items())

def process_request(self, request, spider):

@ -8,19 +8,18 @@ from scrapy import signals


class DownloadTimeoutMiddleware:

def __init__(self, timeout=180):
self._timeout = timeout

@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
o = cls(crawler.settings.getfloat("DOWNLOAD_TIMEOUT"))
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o

def spider_opened(self, spider):
self._timeout = getattr(spider, 'download_timeout', self._timeout)
self._timeout = getattr(spider, "download_timeout", self._timeout)

def process_request(self, request, spider):
if self._timeout:
request.meta.setdefault('download_timeout', self._timeout)
request.meta.setdefault("download_timeout", self._timeout)

@ -24,27 +24,29 @@ class HttpAuthMiddleware:
return o

def spider_opened(self, spider):
usr = getattr(spider, 'http_user', '')
pwd = getattr(spider, 'http_pass', '')
usr = getattr(spider, "http_user", "")
pwd = getattr(spider, "http_pass", "")
if usr or pwd:
self.auth = basic_auth_header(usr, pwd)
if not hasattr(spider, 'http_auth_domain'):
warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
'problems if the spider makes requests to several different domains. http_auth_domain '
'will be set to the domain of the first request, please set it to the correct value '
'explicitly.',
category=ScrapyDeprecationWarning)
if not hasattr(spider, "http_auth_domain"):
warnings.warn(
"Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security "
"problems if the spider makes requests to several different domains. http_auth_domain "
"will be set to the domain of the first request, please set it to the correct value "
"explicitly.",
category=ScrapyDeprecationWarning,
)
self.domain_unset = True
else:
self.domain = spider.http_auth_domain
self.domain_unset = False

def process_request(self, request, spider):
auth = getattr(self, 'auth', None)
if auth and b'Authorization' not in request.headers:
auth = getattr(self, "auth", None)
if auth and b"Authorization" not in request.headers:
domain = urlparse_cached(request).hostname
if self.domain_unset:
self.domain = domain
self.domain_unset = False
if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
request.headers[b'Authorization'] = auth
request.headers[b"Authorization"] = auth

@ -29,21 +29,31 @@ HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddlew

class HttpCacheMiddleware:

DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError)
DOWNLOAD_EXCEPTIONS = (
defer.TimeoutError,
TimeoutError,
DNSLookupError,
ConnectionRefusedError,
ConnectionDone,
ConnectError,
ConnectionLost,
TCPTimedOutError,
ResponseFailed,
IOError,
)

def __init__(self, settings: Settings, stats: StatsCollector) -> None:
if not settings.getbool('HTTPCACHE_ENABLED'):
if not settings.getbool("HTTPCACHE_ENABLED"):
raise NotConfigured
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
self.policy = load_object(settings["HTTPCACHE_POLICY"])(settings)
self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)
self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")
self.stats = stats

@classmethod
def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
def from_crawler(
cls: Type[HttpCacheMiddlewareTV], crawler: Crawler
) -> HttpCacheMiddlewareTV:
o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
@ -56,78 +66,86 @@ class HttpCacheMiddleware:
self.storage.close_spider(spider)

def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
if request.meta.get('dont_cache', False):
if request.meta.get("dont_cache", False):
return None

# Skip uncacheable requests
if not self.policy.should_cache_request(request):
request.meta['_dont_cache'] = True # flag as uncacheable
request.meta["_dont_cache"] = True # flag as uncacheable
return None

# Look for cached response and check if expired
cachedresponse = self.storage.retrieve_response(spider, request)
if cachedresponse is None:
self.stats.inc_value('httpcache/miss', spider=spider)
self.stats.inc_value("httpcache/miss", spider=spider)
if self.ignore_missing:
self.stats.inc_value('httpcache/ignore', spider=spider)
self.stats.inc_value("httpcache/ignore", spider=spider)
raise IgnoreRequest(f"Ignored request not in cache: {request}")
return None # first time request

# Return cached response only if not expired
cachedresponse.flags.append('cached')
cachedresponse.flags.append("cached")
if self.policy.is_cached_response_fresh(cachedresponse, request):
self.stats.inc_value('httpcache/hit', spider=spider)
self.stats.inc_value("httpcache/hit", spider=spider)
return cachedresponse

# Keep a reference to cached response to avoid a second cache lookup on
# process_response hook
request.meta['cached_response'] = cachedresponse
request.meta["cached_response"] = cachedresponse

return None

def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
if request.meta.get('dont_cache', False):
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Response:
if request.meta.get("dont_cache", False):
return response

# Skip cached responses and uncacheable requests
if 'cached' in response.flags or '_dont_cache' in request.meta:
request.meta.pop('_dont_cache', None)
if "cached" in response.flags or "_dont_cache" in request.meta:
request.meta.pop("_dont_cache", None)
return response

# RFC2616 requires origin server to set Date header,
# https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
if 'Date' not in response.headers:
response.headers['Date'] = formatdate(usegmt=True)
if "Date" not in response.headers:
response.headers["Date"] = formatdate(usegmt=True)

# Do not validate first-hand responses
cachedresponse = request.meta.pop('cached_response', None)
cachedresponse = request.meta.pop("cached_response", None)
if cachedresponse is None:
self.stats.inc_value('httpcache/firsthand', spider=spider)
self.stats.inc_value("httpcache/firsthand", spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response

if self.policy.is_cached_response_valid(cachedresponse, response, request):
self.stats.inc_value('httpcache/revalidate', spider=spider)
self.stats.inc_value("httpcache/revalidate", spider=spider)
return cachedresponse

self.stats.inc_value('httpcache/invalidate', spider=spider)
self.stats.inc_value("httpcache/invalidate", spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response

def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Optional[Response]:
cachedresponse = request.meta.pop('cached_response', None)
if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
self.stats.inc_value('httpcache/errorrecovery', spider=spider)
cachedresponse = request.meta.pop("cached_response", None)
if cachedresponse is not None and isinstance(
exception, self.DOWNLOAD_EXCEPTIONS
):
self.stats.inc_value("httpcache/errorrecovery", spider=spider)
return cachedresponse
return None

def _cache_response(
self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
self,
spider: Spider,
response: Response,
request: Request,
cachedresponse: Optional[Response],
) -> None:
if self.policy.should_cache_response(response, request):
self.stats.inc_value('httpcache/store', spider=spider)
self.stats.inc_value("httpcache/store", spider=spider)
self.storage.store_response(spider, request, response)
else:
self.stats.inc_value('httpcache/uncacheable', spider=spider)
self.stats.inc_value("httpcache/uncacheable", spider=spider)

@ -8,17 +8,19 @@ from scrapy.responsetypes import responsetypes
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip

ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
ACCEPTED_ENCODINGS = [b"gzip", b"deflate"]

try:
import brotli
ACCEPTED_ENCODINGS.append(b'br')

ACCEPTED_ENCODINGS.append(b"br")
except ImportError:
pass

try:
import zstandard
ACCEPTED_ENCODINGS.append(b'zstd')

ACCEPTED_ENCODINGS.append(b"zstd")
except ImportError:
pass

@ -26,12 +28,13 @@ except ImportError:
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""

def __init__(self, stats=None):
self.stats = stats

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
if not crawler.settings.getbool("COMPRESSION_ENABLED"):
raise NotConfigured
try:
return cls(stats=crawler.stats)
@ -47,21 +50,26 @@ class HttpCompressionMiddleware:
return result

def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding',
b", ".join(ACCEPTED_ENCODINGS))
request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))

def process_response(self, request, response, spider):

if request.method == 'HEAD':
if request.method == "HEAD":
return response
if isinstance(response, Response):
content_encoding = response.headers.getlist('Content-Encoding')
content_encoding = response.headers.getlist("Content-Encoding")
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
self.stats.inc_value(
"httpcompression/response_bytes",
len(decoded_body),
spider=spider,
)
self.stats.inc_value(
"httpcompression/response_count", spider=spider
)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
@ -69,18 +77,18 @@ class HttpCompressionMiddleware:
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None
kwargs["encoding"] = None
response = response.replace(**kwargs)
if not content_encoding:
del response.headers['Content-Encoding']
del response.headers["Content-Encoding"]

return response

def _decode(self, body, encoding):
if encoding == b'gzip' or encoding == b'x-gzip':
if encoding == b"gzip" or encoding == b"x-gzip":
body = gunzip(body)

if encoding == b'deflate':
if encoding == b"deflate":
try:
body = zlib.decompress(body)
except zlib.error:
@ -90,9 +98,9 @@ class HttpCompressionMiddleware:
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
# http://www.gzip.org/zlib/zlib_faq.html#faq38
body = zlib.decompress(body, -15)
if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS:
body = brotli.decompress(body)
if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS:
# Using its streaming API since its simple API could handle only cases
# where there is content size data embedded in the frame
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))

@ -8,8 +8,7 @@ from scrapy.utils.python import to_bytes


class HttpProxyMiddleware:

def __init__(self, auth_encoding='latin-1'):
def __init__(self, auth_encoding="latin-1"):
self.auth_encoding = auth_encoding
self.proxies = {}
for type_, url in getproxies().items():
@ -22,20 +21,20 @@ class HttpProxyMiddleware:

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
raise NotConfigured
auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
return cls(auth_encoding)

def _basic_auth_header(self, username, password):
user_pass = to_bytes(
f'{unquote(username)}:{unquote(password)}',
encoding=self.auth_encoding)
f"{unquote(username)}:{unquote(password)}", encoding=self.auth_encoding
)
return base64.b64encode(user_pass)

def _get_proxy(self, url, orig_type):
proxy_type, user, password, hostport = _parse_proxy(url)
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))

if user:
creds = self._basic_auth_header(user, password)
@ -46,39 +45,36 @@ class HttpProxyMiddleware:

def process_request(self, request, spider):
creds, proxy_url = None, None
if 'proxy' in request.meta:
if request.meta['proxy'] is not None:
creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
if "proxy" in request.meta:
if request.meta["proxy"] is not None:
creds, proxy_url = self._get_proxy(request.meta["proxy"], "")
elif self.proxies:
parsed = urlparse_cached(request)
scheme = parsed.scheme
if (
(
# 'no_proxy' is only supported by http schemes
scheme not in ('http', 'https')
or not proxy_bypass(parsed.hostname)
)
and scheme in self.proxies
):
# 'no_proxy' is only supported by http schemes
scheme not in ("http", "https")
or not proxy_bypass(parsed.hostname)
) and scheme in self.proxies:
creds, proxy_url = self.proxies[scheme]

self._set_proxy_and_creds(request, proxy_url, creds)

def _set_proxy_and_creds(self, request, proxy_url, creds):
if proxy_url:
request.meta['proxy'] = proxy_url
elif request.meta.get('proxy') is not None:
request.meta['proxy'] = None
request.meta["proxy"] = proxy_url
elif request.meta.get("proxy") is not None:
request.meta["proxy"] = None
if creds:
request.headers[b'Proxy-Authorization'] = b'Basic ' + creds
request.meta['_auth_proxy'] = proxy_url
elif '_auth_proxy' in request.meta:
if proxy_url != request.meta['_auth_proxy']:
if b'Proxy-Authorization' in request.headers:
del request.headers[b'Proxy-Authorization']
del request.meta['_auth_proxy']
elif b'Proxy-Authorization' in request.headers:
request.headers[b"Proxy-Authorization"] = b"Basic " + creds
request.meta["_auth_proxy"] = proxy_url
elif "_auth_proxy" in request.meta:
if proxy_url != request.meta["_auth_proxy"]:
if b"Proxy-Authorization" in request.headers:
del request.headers[b"Proxy-Authorization"]
del request.meta["_auth_proxy"]
elif b"Proxy-Authorization" in request.headers:
if proxy_url:
request.meta['_auth_proxy'] = proxy_url
request.meta["_auth_proxy"] = proxy_url
else:
del request.headers[b'Proxy-Authorization']
del request.headers[b"Proxy-Authorization"]

@ -17,57 +17,66 @@ def _build_redirect_request(source_request, *, url, **kwargs):
**kwargs,
cookies=None,
)
if 'Cookie' in redirect_request.headers:
if "Cookie" in redirect_request.headers:
source_request_netloc = urlparse_cached(source_request).netloc
redirect_request_netloc = urlparse_cached(redirect_request).netloc
if source_request_netloc != redirect_request_netloc:
del redirect_request.headers['Cookie']
del redirect_request.headers["Cookie"]
return redirect_request


class BaseRedirectMiddleware:

enabled_setting = 'REDIRECT_ENABLED'
enabled_setting = "REDIRECT_ENABLED"

def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured

self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
self.max_redirect_times = settings.getint("REDIRECT_MAX_TIMES")
self.priority_adjust = settings.getint("REDIRECT_PRIORITY_ADJUST")

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
ttl = request.meta.setdefault("redirect_ttl", self.max_redirect_times)
redirects = request.meta.get("redirect_times", 0) + 1

if ttl and redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_ttl'] = ttl - 1
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
redirected.meta["redirect_times"] = redirects
redirected.meta["redirect_ttl"] = ttl - 1
redirected.meta["redirect_urls"] = request.meta.get("redirect_urls", []) + [
request.url
]
redirected.meta["redirect_reasons"] = request.meta.get(
"redirect_reasons", []
) + [reason]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{'reason': reason, 'redirected': redirected, 'request': request},
extra={'spider': spider})
logger.debug(
"Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{"reason": reason, "redirected": redirected, "request": request},
extra={"spider": spider},
)
return redirected
logger.debug("Discarding %(request)s: max redirections reached",
{'request': request}, extra={'spider': spider})
logger.debug(
"Discarding %(request)s: max redirections reached",
{"request": request},
extra={"spider": spider},
)
raise IgnoreRequest("max redirections reached")

def _redirect_request_using_get(self, request, redirect_url):
redirect_request = _build_redirect_request(
request,
url=redirect_url,
method='GET',
body='',
method="GET",
body="",
)
redirect_request.headers.pop('Content-Type', None)
redirect_request.headers.pop('Content-Length', None)
redirect_request.headers.pop("Content-Type", None)
redirect_request.headers.pop("Content-Length", None)
return redirect_request


@ -79,25 +88,25 @@ class RedirectMiddleware(BaseRedirectMiddleware):

def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or response.status in getattr(spider, 'handle_httpstatus_list', [])
or response.status in request.meta.get('handle_httpstatus_list', [])
or request.meta.get('handle_httpstatus_all', False)
request.meta.get("dont_redirect", False)
or response.status in getattr(spider, "handle_httpstatus_list", [])
or response.status in request.meta.get("handle_httpstatus_list", [])
or request.meta.get("handle_httpstatus_all", False)
):
return response

allowed_status = (301, 302, 303, 307, 308)
if 'Location' not in response.headers or response.status not in allowed_status:
if "Location" not in response.headers or response.status not in allowed_status:
return response

location = safe_url_string(response.headers['Location'])
if response.headers['Location'].startswith(b'//'):
location = safe_url_string(response.headers["Location"])
if response.headers["Location"].startswith(b"//"):
request_scheme = urlparse(request.url).scheme
location = request_scheme + '://' + location.lstrip('/')
location = request_scheme + "://" + location.lstrip("/")

redirected_url = urljoin(request.url, location)

if response.status in (301, 307, 308) or request.method == 'HEAD':
if response.status in (301, 307, 308) or request.method == "HEAD":
redirected = _build_redirect_request(request, url=redirected_url)
return self._redirect(redirected, request, spider, response.status)

@ -107,25 +116,24 @@ class RedirectMiddleware(BaseRedirectMiddleware):

class MetaRefreshMiddleware(BaseRedirectMiddleware):

enabled_setting = 'METAREFRESH_ENABLED'
enabled_setting = "METAREFRESH_ENABLED"

def __init__(self, settings):
super().__init__(settings)
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')
self._ignore_tags = settings.getlist("METAREFRESH_IGNORE_TAGS")
self._maxdelay = settings.getint("METAREFRESH_MAXDELAY")

def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or request.method == 'HEAD'
request.meta.get("dont_redirect", False)
or request.method == "HEAD"
or not isinstance(response, HtmlResponse)
):
return response

interval, url = get_meta_refresh(response,
ignore_tags=self._ignore_tags)
interval, url = get_meta_refresh(response, ignore_tags=self._ignore_tags)
if url and interval < self._maxdelay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')
return self._redirect(redirected, request, spider, "meta refresh")

return response

@ -39,11 +39,11 @@ def get_retry_request(
request: Request,
*,
spider: Spider,
reason: Union[str, Exception] = 'unspecified',
reason: Union[str, Exception] = "unspecified",
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
logger: Logger = retry_logger,
stats_base_key: str = 'retry',
stats_base_key: str = "retry",
):
"""
Returns a new :class:`~scrapy.Request` object to retry the specified
@ -87,22 +87,22 @@ def get_retry_request(
"""
settings = spider.crawler.settings
stats = spider.crawler.stats
retry_times = request.meta.get('retry_times', 0) + 1
retry_times = request.meta.get("retry_times", 0) + 1
if max_retry_times is None:
max_retry_times = request.meta.get('max_retry_times')
max_retry_times = request.meta.get("max_retry_times")
if max_retry_times is None:
max_retry_times = settings.getint('RETRY_TIMES')
max_retry_times = settings.getint("RETRY_TIMES")
if retry_times <= max_retry_times:
logger.debug(
"Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
{'request': request, 'retry_times': retry_times, 'reason': reason},
extra={'spider': spider}
{"request": request, "retry_times": retry_times, "reason": reason},
extra={"spider": spider},
)
new_request: Request = request.copy()
new_request.meta['retry_times'] = retry_times
new_request.meta["retry_times"] = retry_times
new_request.dont_filter = True
if priority_adjust is None:
priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
new_request.priority = request.priority + priority_adjust

if callable(reason):
@ -110,15 +110,14 @@ def get_retry_request(
if isinstance(reason, Exception):
reason = global_object_name(reason.__class__)

stats.inc_value(f'{stats_base_key}/count')
stats.inc_value(f'{stats_base_key}/reason_count/{reason}')
stats.inc_value(f"{stats_base_key}/count")
stats.inc_value(f"{stats_base_key}/reason_count/{reason}")
return new_request
stats.inc_value(f'{stats_base_key}/max_reached')
stats.inc_value(f"{stats_base_key}/max_reached")
logger.error(
"Gave up retrying %(request)s (failed %(retry_times)d times): "
"%(reason)s",
{'request': request, 'retry_times': retry_times, 'reason': reason},
extra={'spider': spider},
"Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s",
{"request": request, "retry_times": retry_times, "reason": reason},
extra={"spider": spider},
)
return None

@ -127,24 +126,35 @@ class RetryMiddleware:

# IOError is raised by the HttpCompression middleware when trying to
# decompress an empty response
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError, TunnelError)
EXCEPTIONS_TO_RETRY = (
defer.TimeoutError,
TimeoutError,
DNSLookupError,
ConnectionRefusedError,
ConnectionDone,
ConnectError,
ConnectionLost,
TCPTimedOutError,
ResponseFailed,
IOError,
TunnelError,
)

def __init__(self, settings):
if not settings.getbool('RETRY_ENABLED'):
if not settings.getbool("RETRY_ENABLED"):
raise NotConfigured
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
self.max_retry_times = settings.getint("RETRY_TIMES")
self.retry_http_codes = set(
int(x) for x in settings.getlist("RETRY_HTTP_CODES")
)
self.priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_response(self, request, response, spider):
if request.meta.get('dont_retry', False):
if request.meta.get("dont_retry", False):
return response
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
@ -152,15 +162,14 @@ class RetryMiddleware:
return response

def process_exception(self, request, exception, spider):
if (
isinstance(exception, self.EXCEPTIONS_TO_RETRY)
and not request.meta.get('dont_retry', False)
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) and not request.meta.get(
"dont_retry", False
):
return self._retry(request, exception, spider)

def _retry(self, request, reason, spider):
max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
return get_retry_request(
request,
reason=reason,

@ -20,23 +20,23 @@ class RobotsTxtMiddleware:
DOWNLOAD_PRIORITY = 1000

def __init__(self, crawler):
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
raise NotConfigured
self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
self.crawler = crawler
self._parsers = {}
self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))
self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))

# check if parser dependencies are met, this should throw an error otherwise.
self._parserimpl.from_crawler(self.crawler, b'')
self._parserimpl.from_crawler(self.crawler, b"")

@classmethod
def from_crawler(cls, crawler):
return cls(crawler)

def process_request(self, request, spider):
if request.meta.get('dont_obey_robotstxt'):
if request.meta.get("dont_obey_robotstxt"):
return
d = maybeDeferred(self.robot_parser, request, spider)
d.addCallback(self.process_request_2, request, spider)
@ -48,11 +48,14 @@ class RobotsTxtMiddleware:

useragent = self._robotstxt_useragent
if not useragent:
useragent = request.headers.get(b'User-Agent', self._default_useragent)
useragent = request.headers.get(b"User-Agent", self._default_useragent)
if not rp.allowed(request.url, useragent):
logger.debug("Forbidden by robots.txt: %(request)s",
{'request': request}, extra={'spider': spider})
self.crawler.stats.inc_value('robotstxt/forbidden')
logger.debug(
"Forbidden by robots.txt: %(request)s",
{"request": request},
extra={"spider": spider},
)
self.crawler.stats.inc_value("robotstxt/forbidden")
raise IgnoreRequest("Forbidden by robots.txt")

def robot_parser(self, request, spider):
@ -65,13 +68,13 @@ class RobotsTxtMiddleware:
robotsreq = Request(
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={'dont_obey_robotstxt': True}
meta={"dont_obey_robotstxt": True},
)
dfd = self.crawler.engine.download(robotsreq)
dfd.addCallback(self._parse_robots, netloc, spider)
dfd.addErrback(self._logerror, robotsreq, spider)
dfd.addErrback(self._robots_error, netloc)
self.crawler.stats.inc_value('robotstxt/request_count')
self.crawler.stats.inc_value("robotstxt/request_count")

if isinstance(self._parsers[netloc], Deferred):
d = Deferred()
@ -79,21 +82,26 @@ class RobotsTxtMiddleware:
def cb(result):
d.callback(result)
return result

self._parsers[netloc].addCallback(cb)
return d
return self._parsers[netloc]

def _logerror(self, failure, request, spider):
if failure.type is not IgnoreRequest:
logger.error("Error downloading %(request)s: %(f_exception)s",
{'request': request, 'f_exception': failure.value},
exc_info=failure_to_exc_info(failure),
extra={'spider': spider})
logger.error(
"Error downloading %(request)s: %(f_exception)s",
{"request": request, "f_exception": failure.value},
exc_info=failure_to_exc_info(failure),
extra={"spider": spider},
)
return failure

def _parse_robots(self, response, netloc, spider):
self.crawler.stats.inc_value('robotstxt/response_count')
self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
self.crawler.stats.inc_value("robotstxt/response_count")
self.crawler.stats.inc_value(
f"robotstxt/response_status_count/{response.status}"
)
rp = self._parserimpl.from_crawler(self.crawler, response.body)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = rp
@ -101,7 +109,7 @@ class RobotsTxtMiddleware:

def _robots_error(self, failure, netloc):
if failure.type is not IgnoreRequest:
key = f'robotstxt/exception_count/{failure.type}'
key = f"robotstxt/exception_count/{failure.type}"
self.crawler.stats.inc_value(key)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = None

@ -11,40 +11,50 @@ def get_header_size(headers):
if isinstance(value, (list, tuple)):
for v in value:
size += len(b": ") + len(key) + len(v)
return size + len(b'\r\n') * (len(headers.keys()) - 1)
return size + len(b"\r\n") * (len(headers.keys()) - 1)


def get_status_size(response_status):
return len(to_bytes(http.RESPONSES.get(response_status, b''))) + 15
return len(to_bytes(http.RESPONSES.get(response_status, b""))) + 15
# resp.status + b"\r\n" + b"HTTP/1.1 <100-599> "


class DownloaderStats:

def __init__(self, stats):
self.stats = stats

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('DOWNLOADER_STATS'):
if not crawler.settings.getbool("DOWNLOADER_STATS"):
raise NotConfigured
return cls(crawler.stats)

def process_request(self, request, spider):
self.stats.inc_value('downloader/request_count', spider=spider)
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
self.stats.inc_value("downloader/request_count", spider=spider)
self.stats.inc_value(
f"downloader/request_method_count/{request.method}", spider=spider
)
reqlen = len(request_httprepr(request))
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
self.stats.inc_value("downloader/request_bytes", reqlen, spider=spider)

def process_response(self, request, response, spider):
self.stats.inc_value('downloader/response_count', spider=spider)
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4
self.stats.inc_value("downloader/response_count", spider=spider)
self.stats.inc_value(
f"downloader/response_status_count/{response.status}", spider=spider
)
reslen = (
len(response.body)
+ get_header_size(response.headers)
+ get_status_size(response.status)
+ 4
)
# response.body + b"\r\n"+ response.header + b"\r\n" + response.status
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
self.stats.inc_value("downloader/response_bytes", reslen, spider=spider)
return response

def process_exception(self, request, exception, spider):
ex_class = global_object_name(exception.__class__)
self.stats.inc_value('downloader/exception_count', spider=spider)
self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
self.stats.inc_value("downloader/exception_count", spider=spider)
self.stats.inc_value(
f"downloader/exception_type_count/{ex_class}", spider=spider
)

@ -6,18 +6,18 @@ from scrapy import signals
class UserAgentMiddleware:
"""This middleware allows spiders to override the user_agent"""

def __init__(self, user_agent='Scrapy'):
def __init__(self, user_agent="Scrapy"):
self.user_agent = user_agent

@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings['USER_AGENT'])
o = cls(crawler.settings["USER_AGENT"])
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o

def spider_opened(self, spider):
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
self.user_agent = getattr(spider, "user_agent", self.user_agent)

def process_request(self, request, spider):
if self.user_agent:
request.headers.setdefault(b'User-Agent', self.user_agent)
request.headers.setdefault(b"User-Agent", self.user_agent)

@ -18,7 +18,9 @@ BaseDupeFilterTV = TypeVar("BaseDupeFilterTV", bound="BaseDupeFilter")

class BaseDupeFilter:
@classmethod
def from_settings(cls: Type[BaseDupeFilterTV], settings: BaseSettings) -> BaseDupeFilterTV:
def from_settings(
cls: Type[BaseDupeFilterTV], settings: BaseSettings
) -> BaseDupeFilterTV:
return cls()

def request_seen(self, request: Request) -> bool:
@ -55,13 +57,15 @@ class RFPDupeFilter(BaseDupeFilter):
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
self.file = Path(path, 'requests.seen').open('a+', encoding="utf-8")
self.file = Path(path, "requests.seen").open("a+", encoding="utf-8")
self.file.seek(0)
self.fingerprints.update(x.rstrip() for x in self.file)

@classmethod
def from_settings(cls: Type[RFPDupeFilterTV], settings: BaseSettings, *, fingerprinter=None) -> RFPDupeFilterTV:
debug = settings.getbool('DUPEFILTER_DEBUG')
def from_settings(
cls: Type[RFPDupeFilterTV], settings: BaseSettings, *, fingerprinter=None
) -> RFPDupeFilterTV:
debug = settings.getbool("DUPEFILTER_DEBUG")
try:
return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
except TypeError:
@ -100,7 +104,7 @@ class RFPDupeFilter(BaseDupeFilter):
return True
self.fingerprints.add(fp)
if self.file:
self.file.write(fp + '\n')
self.file.write(fp + "\n")
return False

def request_fingerprint(self, request: Request) -> str:
@ -113,13 +117,15 @@ class RFPDupeFilter(BaseDupeFilter):
def log(self, request: Request, spider: Spider) -> None:
if self.debug:
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
args = {'request': request, 'referer': referer_str(request)}
self.logger.debug(msg, args, extra={'spider': spider})
args = {"request": request, "referer": referer_str(request)}
self.logger.debug(msg, args, extra={"spider": spider})
elif self.logdupes:
msg = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
msg = (
"Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)"
)
self.logger.debug(msg, {"request": request}, extra={"spider": spider})
self.logdupes = False

spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)

@ -10,6 +10,7 @@ new exceptions here without documenting them there.

class NotConfigured(Exception):
"""Indicates a missing configuration situation"""

pass


@ -18,6 +19,7 @@ class _InvalidOutput(TypeError):
Indicates an invalid value has been returned by a middleware's processing method.
Internal and undocumented, it should not be raised or caught by user code.
"""

pass


@ -30,13 +32,14 @@ class IgnoreRequest(Exception):

class DontCloseSpider(Exception):
"""Request the spider not to be closed yet"""

pass


class CloseSpider(Exception):
"""Raise this from callbacks to request the spider to be closed"""

def __init__(self, reason='cancelled'):
def __init__(self, reason="cancelled"):
super().__init__()
self.reason = reason

@ -58,11 +61,13 @@ class StopDownload(Exception):

class DropItem(Exception):
"""Drop item from the item pipeline"""

pass


class NotSupported(Exception):
"""Indicates a feature or method is not supported"""

pass


@ -73,7 +78,7 @@ class UsageError(Exception):
"""To indicate a command-line usage error"""

def __init__(self, *a, **kw):
self.print_help = kw.pop('print_help', True)
self.print_help = kw.pop("print_help", True)
super().__init__(*a, **kw)


@ -81,9 +86,11 @@ class ScrapyDeprecationWarning(Warning):
"""Warning category for deprecated features, since the default
DeprecationWarning is silenced on Python 2.7+
"""

pass


class ContractFail(AssertionError):
"""Error raised in case of a failing contract"""

pass

@ -19,13 +19,19 @@ from scrapy.utils.python import is_listlike, to_bytes, to_unicode
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
|
||||
|
||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
|
||||
'JsonItemExporter', 'MarshalItemExporter']
|
||||
__all__ = [
|
||||
"BaseItemExporter",
|
||||
"PprintItemExporter",
|
||||
"PickleItemExporter",
|
||||
"CsvItemExporter",
|
||||
"XmlItemExporter",
|
||||
"JsonLinesItemExporter",
|
||||
"JsonItemExporter",
|
||||
"MarshalItemExporter",
|
||||
]
|
||||
|
||||
|
||||
class BaseItemExporter:
|
||||
|
||||
def __init__(self, *, dont_fail=False, **kwargs):
|
||||
self._kwargs = kwargs
|
||||
self._configure(kwargs, dont_fail=dont_fail)
|
||||
@ -35,10 +41,10 @@ class BaseItemExporter:
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful when used with keyword arguments in subclasses' ``__init__`` methods)
(useful when used with keyword arguments in subclasses' ``__init__`` methods)
|
||||
"""
|
||||
self.encoding = options.pop('encoding', None)
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
self.export_empty_fields = options.pop('export_empty_fields', False)
|
||||
self.indent = options.pop('indent', None)
|
||||
self.encoding = options.pop("encoding", None)
|
||||
self.fields_to_export = options.pop("fields_to_export", None)
|
||||
self.export_empty_fields = options.pop("export_empty_fields", False)
|
||||
self.indent = options.pop("indent", None)
|
||||
if not dont_fail and options:
|
||||
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
|
||||
|
||||
@ -46,7 +52,7 @@ class BaseItemExporter:
|
||||
raise NotImplementedError
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', lambda x: x)
|
||||
serializer = field.get("serializer", lambda x: x)
|
||||
return serializer(value)
|
||||
|
||||
def start_exporting(self):
|
||||
@ -74,8 +80,7 @@ class BaseItemExporter:
|
||||
field_iter = self.fields_to_export.items()
|
||||
else:
|
||||
field_iter = (
|
||||
(x, y) for x, y in self.fields_to_export.items()
|
||||
if x in item
|
||||
(x, y) for x, y in self.fields_to_export.items() if x in item
|
||||
)
|
||||
else:
|
||||
if include_empty:
|
||||
@ -98,36 +103,36 @@ class BaseItemExporter:
|
||||
|
||||
|
||||
class JsonLinesItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
self.file = file
|
||||
self._kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
self._kwargs.setdefault("ensure_ascii", not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**self._kwargs)
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
data = self.encoder.encode(itemdict) + '\n'
|
||||
data = self.encoder.encode(itemdict) + "\n"
|
||||
self.file.write(to_bytes(data, self.encoding))
|
||||
|
||||
|
||||
class JsonItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
self.file = file
|
||||
# there is a small difference between the behaviour of JsonItemExporter.indent
|
||||
# and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
|
||||
# the addition of newlines everywhere
|
||||
json_indent = self.indent if self.indent is not None and self.indent > 0 else None
|
||||
self._kwargs.setdefault('indent', json_indent)
|
||||
self._kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
json_indent = (
|
||||
self.indent if self.indent is not None and self.indent > 0 else None
|
||||
)
|
||||
self._kwargs.setdefault("indent", json_indent)
|
||||
self._kwargs.setdefault("ensure_ascii", not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**self._kwargs)
|
||||
self.first_item = True
|
||||
|
||||
def _beautify_newline(self):
|
||||
if self.indent is not None:
|
||||
self.file.write(b'\n')
|
||||
self.file.write(b"\n")
|
||||
|
||||
def start_exporting(self):
|
||||
self.file.write(b"[")
|
||||
@ -141,7 +146,7 @@ class JsonItemExporter(BaseItemExporter):
|
||||
if self.first_item:
|
||||
self.first_item = False
|
||||
else:
|
||||
self.file.write(b',')
|
||||
self.file.write(b",")
|
||||
self._beautify_newline()
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
data = self.encoder.encode(itemdict)
|
||||
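The JSON exporters above can also be driven directly, outside the feed export machinery. A minimal sketch, assuming a local items.json path; exporters expect a binary file object:

from scrapy.exporters import JsonItemExporter

with open("items.json", "wb") as f:          # hypothetical output path
    exporter = JsonItemExporter(f, indent=2)
    exporter.start_exporting()               # writes the opening "["
    exporter.export_item({"name": "example"})
    exporter.finish_exporting()              # writes the closing "]"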
@ -149,22 +154,21 @@ class JsonItemExporter(BaseItemExporter):
|
||||
|
||||
|
||||
class XmlItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self.item_element = kwargs.pop('item_element', 'item')
|
||||
self.root_element = kwargs.pop('root_element', 'items')
|
||||
self.item_element = kwargs.pop("item_element", "item")
|
||||
self.root_element = kwargs.pop("root_element", "items")
|
||||
super().__init__(**kwargs)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.encoding = "utf-8"
|
||||
self.xg = XMLGenerator(file, encoding=self.encoding)
|
||||
|
||||
def _beautify_newline(self, new_item=False):
|
||||
if self.indent is not None and (self.indent > 0 or new_item):
|
||||
self.xg.characters('\n')
|
||||
self.xg.characters("\n")
|
||||
|
||||
def _beautify_indent(self, depth=1):
|
||||
if self.indent:
|
||||
self.xg.characters(' ' * self.indent * depth)
|
||||
self.xg.characters(" " * self.indent * depth)
|
||||
|
||||
def start_exporting(self):
|
||||
self.xg.startDocument()
|
||||
@ -175,7 +179,7 @@ class XmlItemExporter(BaseItemExporter):
|
||||
self._beautify_indent(depth=1)
|
||||
self.xg.startElement(self.item_element, {})
|
||||
self._beautify_newline()
|
||||
for name, value in self._get_serialized_fields(item, default_value=''):
|
||||
for name, value in self._get_serialized_fields(item, default_value=""):
|
||||
self._export_xml_field(name, value, depth=2)
|
||||
self._beautify_indent(depth=1)
|
||||
self.xg.endElement(self.item_element)
|
||||
@ -188,7 +192,7 @@ class XmlItemExporter(BaseItemExporter):
|
||||
def _export_xml_field(self, name, serialized_value, depth):
|
||||
self._beautify_indent(depth=depth)
|
||||
self.xg.startElement(name, {})
|
||||
if hasattr(serialized_value, 'items'):
|
||||
if hasattr(serialized_value, "items"):
|
||||
self._beautify_newline()
|
||||
for subname, value in serialized_value.items():
|
||||
self._export_xml_field(subname, value, depth=depth + 1)
|
||||
@ -196,7 +200,7 @@ class XmlItemExporter(BaseItemExporter):
|
||||
elif is_listlike(serialized_value):
|
||||
self._beautify_newline()
|
||||
for value in serialized_value:
|
||||
self._export_xml_field('value', value, depth=depth + 1)
|
||||
self._export_xml_field("value", value, depth=depth + 1)
|
||||
self._beautify_indent(depth=depth)
|
||||
elif isinstance(serialized_value, str):
|
||||
self.xg.characters(serialized_value)
|
||||
@ -207,18 +211,24 @@ class XmlItemExporter(BaseItemExporter):
|
||||
|
||||
|
||||
class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, include_headers_line=True, join_multivalued=',', errors=None, **kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
file,
|
||||
include_headers_line=True,
|
||||
join_multivalued=",",
|
||||
errors=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.encoding = "utf-8"
|
||||
self.include_headers_line = include_headers_line
|
||||
self.stream = io.TextIOWrapper(
|
||||
file,
|
||||
line_buffering=False,
|
||||
write_through=True,
|
||||
encoding=self.encoding,
|
||||
newline='', # Windows needs this https://github.com/scrapy/scrapy/issues/3034
|
||||
newline="", # Windows needs this https://github.com/scrapy/scrapy/issues/3034
|
||||
errors=errors,
|
||||
)
|
||||
self.csv_writer = csv.writer(self.stream, **self._kwargs)
|
||||
@ -226,7 +236,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
self._join_multivalued = join_multivalued
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._join_if_needed)
|
||||
serializer = field.get("serializer", self._join_if_needed)
|
||||
return serializer(value)
|
||||
|
||||
def _join_if_needed(self, value):
|
||||
@ -242,8 +252,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
self._headers_not_written = False
|
||||
self._write_headers_and_set_fields_to_export(item)
|
||||
|
||||
fields = self._get_serialized_fields(item, default_value='',
|
||||
include_empty=True)
|
||||
fields = self._get_serialized_fields(item, default_value="", include_empty=True)
|
||||
values = list(self._build_row(x for _, x in fields))
|
||||
self.csv_writer.writerow(values)
|
||||
|
||||
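As with the JSON exporters, CsvItemExporter can be used standalone; list-like values are joined with the join_multivalued separator handled above. A minimal sketch with a hypothetical output path:

from scrapy.exporters import CsvItemExporter

with open("items.csv", "wb") as f:           # binary mode; the exporter wraps it in a TextIOWrapper
    exporter = CsvItemExporter(f, join_multivalued=";")
    exporter.start_exporting()
    exporter.export_item({"name": "example", "tags": ["a", "b"]})  # tags become "a;b"
    exporter.finish_exporting()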
@ -268,7 +277,6 @@ class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
|
||||
class PickleItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, protocol=4, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.file = file
|
||||
@ -297,14 +305,13 @@ class MarshalItemExporter(BaseItemExporter):
|
||||
|
||||
|
||||
class PprintItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.file = file
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(to_bytes(pprint.pformat(itemdict) + '\n'))
|
||||
self.file.write(to_bytes(pprint.pformat(itemdict) + "\n"))
|
||||
|
||||
|
||||
class PythonItemExporter(BaseItemExporter):
|
||||
@ -318,17 +325,18 @@ class PythonItemExporter(BaseItemExporter):
|
||||
"""
|
||||
|
||||
def _configure(self, options, dont_fail=False):
|
||||
self.binary = options.pop('binary', True)
|
||||
self.binary = options.pop("binary", True)
|
||||
super()._configure(options, dont_fail)
|
||||
if self.binary:
|
||||
warnings.warn(
|
||||
"PythonItemExporter will drop support for binary export in the future",
|
||||
ScrapyDeprecationWarning)
|
||||
ScrapyDeprecationWarning,
|
||||
)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.encoding = "utf-8"
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._serialize_value)
|
||||
serializer = field.get("serializer", self._serialize_value)
|
||||
return serializer(value)
|
||||
|
||||
def _serialize_value(self, value):
|
||||
|
@ -9,8 +9,8 @@ from scrapy.utils.conf import build_component_list
|
||||
|
||||
class ExtensionManager(MiddlewareManager):
|
||||
|
||||
component_name = 'extension'
|
||||
component_name = "extension"
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(settings.getwithbase('EXTENSIONS'))
|
||||
return build_component_list(settings.getwithbase("EXTENSIONS"))
|
||||
|
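The manager above builds its component list from the EXTENSIONS setting (merged with EXTENSIONS_BASE via getwithbase). A minimal sketch of enabling an extension explicitly; the order value is a load-order integer:

# Hypothetical project settings
EXTENSIONS = {
    "scrapy.extensions.closespider.CloseSpider": 500,
}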
@ -11,15 +11,14 @@ from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
class CloseSpider:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.crawler = crawler
|
||||
|
||||
self.close_on = {
|
||||
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
|
||||
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
|
||||
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
|
||||
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
|
||||
"timeout": crawler.settings.getfloat("CLOSESPIDER_TIMEOUT"),
|
||||
"itemcount": crawler.settings.getint("CLOSESPIDER_ITEMCOUNT"),
|
||||
"pagecount": crawler.settings.getint("CLOSESPIDER_PAGECOUNT"),
|
||||
"errorcount": crawler.settings.getint("CLOSESPIDER_ERRORCOUNT"),
|
||||
}
|
||||
|
||||
if not any(self.close_on.values()):
|
||||
@ -27,13 +26,13 @@ class CloseSpider:
|
||||
|
||||
self.counter = defaultdict(int)
|
||||
|
||||
if self.close_on.get('errorcount'):
|
||||
if self.close_on.get("errorcount"):
|
||||
crawler.signals.connect(self.error_count, signal=signals.spider_error)
|
||||
if self.close_on.get('pagecount'):
|
||||
if self.close_on.get("pagecount"):
|
||||
crawler.signals.connect(self.page_count, signal=signals.response_received)
|
||||
if self.close_on.get('timeout'):
|
||||
if self.close_on.get("timeout"):
|
||||
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
if self.close_on.get('itemcount'):
|
||||
if self.close_on.get("itemcount"):
|
||||
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
|
||||
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
@ -42,27 +41,31 @@ class CloseSpider:
|
||||
return cls(crawler)
|
||||
|
||||
def error_count(self, failure, response, spider):
|
||||
self.counter['errorcount'] += 1
|
||||
if self.counter['errorcount'] == self.close_on['errorcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
|
||||
self.counter["errorcount"] += 1
|
||||
if self.counter["errorcount"] == self.close_on["errorcount"]:
|
||||
self.crawler.engine.close_spider(spider, "closespider_errorcount")
|
||||
|
||||
def page_count(self, response, request, spider):
|
||||
self.counter['pagecount'] += 1
|
||||
if self.counter['pagecount'] == self.close_on['pagecount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
|
||||
self.counter["pagecount"] += 1
|
||||
if self.counter["pagecount"] == self.close_on["pagecount"]:
|
||||
self.crawler.engine.close_spider(spider, "closespider_pagecount")
|
||||
|
||||
def spider_opened(self, spider):
|
||||
from twisted.internet import reactor
|
||||
self.task = reactor.callLater(self.close_on['timeout'],
|
||||
self.crawler.engine.close_spider, spider,
|
||||
reason='closespider_timeout')
|
||||
|
||||
self.task = reactor.callLater(
|
||||
self.close_on["timeout"],
|
||||
self.crawler.engine.close_spider,
|
||||
spider,
|
||||
reason="closespider_timeout",
|
||||
)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.counter['itemcount'] += 1
|
||||
if self.counter['itemcount'] == self.close_on['itemcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
|
||||
self.counter["itemcount"] += 1
|
||||
if self.counter["itemcount"] == self.close_on["itemcount"]:
|
||||
self.crawler.engine.close_spider(spider, "closespider_itemcount")
|
||||
|
||||
def spider_closed(self, spider):
|
||||
task = getattr(self, 'task', False)
|
||||
task = getattr(self, "task", False)
|
||||
if task and task.active():
|
||||
task.cancel()
|
||||
|
@ -7,7 +7,6 @@ from scrapy import signals
|
||||
|
||||
|
||||
class CoreStats:
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
self.start_time = None
|
||||
@ -24,23 +23,25 @@ class CoreStats:
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.start_time = datetime.utcnow()
|
||||
self.stats.set_value('start_time', self.start_time, spider=spider)
|
||||
self.stats.set_value("start_time", self.start_time, spider=spider)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
finish_time = datetime.utcnow()
|
||||
elapsed_time = finish_time - self.start_time
|
||||
elapsed_time_seconds = elapsed_time.total_seconds()
|
||||
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
|
||||
self.stats.set_value('finish_time', finish_time, spider=spider)
|
||||
self.stats.set_value('finish_reason', reason, spider=spider)
|
||||
self.stats.set_value(
|
||||
"elapsed_time_seconds", elapsed_time_seconds, spider=spider
|
||||
)
|
||||
self.stats.set_value("finish_time", finish_time, spider=spider)
|
||||
self.stats.set_value("finish_reason", reason, spider=spider)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.stats.inc_value('item_scraped_count', spider=spider)
|
||||
self.stats.inc_value("item_scraped_count", spider=spider)
|
||||
|
||||
def response_received(self, spider):
|
||||
self.stats.inc_value('response_received_count', spider=spider)
|
||||
self.stats.inc_value("response_received_count", spider=spider)
|
||||
|
||||
def item_dropped(self, item, spider, exception):
|
||||
reason = exception.__class__.__name__
|
||||
self.stats.inc_value('item_dropped_count', spider=spider)
|
||||
self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
|
||||
self.stats.inc_value("item_dropped_count", spider=spider)
|
||||
self.stats.inc_value(f"item_dropped_reasons_count/{reason}", spider=spider)
|
||||
|
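The counters set by CoreStats above end up in the crawler's stats collector, where other components can read them. A hedged sketch (the spider name is hypothetical) of reading a value from a spider's closed() shortcut:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"

    def closed(self, reason):
        # item_scraped_count and finish_reason are among the keys set above
        count = self.crawler.stats.get_value("item_scraped_count", 0)
        self.logger.info("scraped %d items, finished: %s", count, reason)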
@ -18,7 +18,6 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StackTraceDump:
|
||||
|
||||
def __init__(self, crawler=None):
|
||||
self.crawler = crawler
|
||||
try:
|
||||
@ -34,20 +33,23 @@ class StackTraceDump:
|
||||
|
||||
def dump_stacktrace(self, signum, frame):
|
||||
log_args = {
|
||||
'stackdumps': self._thread_stacks(),
|
||||
'enginestatus': format_engine_status(self.crawler.engine),
|
||||
'liverefs': format_live_refs(),
|
||||
"stackdumps": self._thread_stacks(),
|
||||
"enginestatus": format_engine_status(self.crawler.engine),
|
||||
"liverefs": format_live_refs(),
|
||||
}
|
||||
logger.info("Dumping stack trace and engine status\n"
|
||||
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
|
||||
log_args, extra={'crawler': self.crawler})
|
||||
logger.info(
|
||||
"Dumping stack trace and engine status\n"
|
||||
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
|
||||
log_args,
|
||||
extra={"crawler": self.crawler},
|
||||
)
|
||||
|
||||
def _thread_stacks(self):
|
||||
id2name = dict((th.ident, th.name) for th in threading.enumerate())
|
||||
dumps = ''
|
||||
dumps = ""
|
||||
for id_, frame in sys._current_frames().items():
|
||||
name = id2name.get(id_, '')
|
||||
dump = ''.join(traceback.format_stack(frame))
|
||||
name = id2name.get(id_, "")
|
||||
dump = "".join(traceback.format_stack(frame))
|
||||
dumps += f"# Thread: {name}({id_})\n{dump}\n"
|
||||
return dumps
|
||||
|
||||
|
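Assuming the handlers registered in StackTraceDump.__init__ are bound to SIGQUIT/SIGUSR2, as they are on POSIX platforms (the registration itself is not shown in this hunk), the dump above can be triggered from another process. An illustrative sketch; the PID is a placeholder:

import os
import signal

scrapy_pid = 12345                    # hypothetical PID of a running crawl
os.kill(scrapy_pid, signal.SIGQUIT)   # asks StackTraceDump to log threads, engine status and live refs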
@ -34,15 +34,15 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
|
||||
argument_names = get_func_args(builder)
|
||||
if 'feed_options' in argument_names:
|
||||
kwargs['feed_options'] = feed_options
|
||||
if "feed_options" in argument_names:
|
||||
kwargs["feed_options"] = feed_options
|
||||
else:
|
||||
warnings.warn(
|
||||
f"{builder.__qualname__} does not support the 'feed_options' keyword argument. Add a "
|
||||
"'feed_options' parameter to its signature to remove this "
|
||||
"warning. This parameter will become mandatory in a future "
|
||||
"version of Scrapy.",
|
||||
category=ScrapyDeprecationWarning
|
||||
category=ScrapyDeprecationWarning,
|
||||
)
|
||||
return builder(*preargs, uri, *args, **kwargs)
|
||||
|
||||
@ -55,6 +55,7 @@ class ItemFilter:
|
||||
:param feed_options: feed specific options passed from FeedExporter
|
||||
:type feed_options: dict
|
||||
"""
|
||||
|
||||
feed_options: Optional[dict]
|
||||
item_classes: Tuple
|
||||
|
||||
@ -62,7 +63,8 @@ class ItemFilter:
|
||||
self.feed_options = feed_options
|
||||
if feed_options is not None:
|
||||
self.item_classes = tuple(
|
||||
load_object(item_class) for item_class in feed_options.get("item_classes") or ()
|
||||
load_object(item_class)
|
||||
for item_class in feed_options.get("item_classes") or ()
|
||||
)
|
||||
else:
|
||||
self.item_classes = tuple()
|
||||
@ -98,13 +100,12 @@ class IFeedStorage(Interface):
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class BlockingFeedStorage:
|
||||
|
||||
def open(self, spider):
|
||||
path = spider.crawler.settings['FEED_TEMPDIR']
|
||||
path = spider.crawler.settings["FEED_TEMPDIR"]
|
||||
if path and not Path(path).is_dir():
|
||||
raise OSError('Not a Directory: ' + str(path))
|
||||
raise OSError("Not a Directory: " + str(path))
|
||||
|
||||
return NamedTemporaryFile(prefix='feed-', dir=path)
|
||||
return NamedTemporaryFile(prefix="feed-", dir=path)
|
||||
|
||||
def store(self, file):
|
||||
return threads.deferToThread(self._store_in_thread, file)
|
||||
@ -115,16 +116,17 @@ class BlockingFeedStorage:
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class StdoutFeedStorage:
|
||||
|
||||
def __init__(self, uri, _stdout=None, *, feed_options=None):
|
||||
if not _stdout:
|
||||
_stdout = sys.stdout.buffer
|
||||
self._stdout = _stdout
|
||||
if feed_options and feed_options.get('overwrite', False) is True:
|
||||
logger.warning('Standard output (stdout) storage does not support '
|
||||
'overwriting. To suppress this warning, remove the '
|
||||
'overwrite option from your FEEDS setting, or set '
|
||||
'it to False.')
|
||||
if feed_options and feed_options.get("overwrite", False) is True:
|
||||
logger.warning(
|
||||
"Standard output (stdout) storage does not support "
|
||||
"overwriting. To suppress this warning, remove the "
|
||||
"overwrite option from your FEEDS setting, or set "
|
||||
"it to False."
|
||||
)
|
||||
|
||||
def open(self, spider):
|
||||
return self._stdout
|
||||
@ -135,11 +137,10 @@ class StdoutFeedStorage:
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class FileFeedStorage:
|
||||
|
||||
def __init__(self, uri, *, feed_options=None):
|
||||
self.path = file_uri_to_path(uri)
|
||||
feed_options = feed_options or {}
|
||||
self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'
|
||||
self.write_mode = "wb" if feed_options.get("overwrite", False) else "ab"
|
||||
|
||||
def open(self, spider) -> IO[Any]:
|
||||
dirname = Path(self.path).parent
|
||||
@ -152,11 +153,19 @@ class FileFeedStorage:
|
||||
|
||||
|
||||
class S3FeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri, access_key=None, secret_key=None, acl=None, endpoint_url=None, *,
|
||||
feed_options=None, session_token=None):
|
||||
def __init__(
|
||||
self,
|
||||
uri,
|
||||
access_key=None,
|
||||
secret_key=None,
|
||||
acl=None,
|
||||
endpoint_url=None,
|
||||
*,
|
||||
feed_options=None,
|
||||
session_token=None,
|
||||
):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured('missing botocore library')
|
||||
raise NotConfigured("missing botocore library")
|
||||
u = urlparse(uri)
|
||||
self.bucketname = u.hostname
|
||||
self.access_key = u.username or access_key
|
||||
@ -166,41 +175,45 @@ class S3FeedStorage(BlockingFeedStorage):
|
||||
self.acl = acl
|
||||
self.endpoint_url = endpoint_url
|
||||
import botocore.session
|
||||
|
||||
session = botocore.session.get_session()
|
||||
self.s3_client = session.create_client(
|
||||
's3', aws_access_key_id=self.access_key,
|
||||
"s3",
|
||||
aws_access_key_id=self.access_key,
|
||||
aws_secret_access_key=self.secret_key,
|
||||
aws_session_token=self.session_token,
|
||||
endpoint_url=self.endpoint_url)
|
||||
if feed_options and feed_options.get('overwrite', True) is False:
|
||||
logger.warning('S3 does not support appending to files. To '
|
||||
'suppress this warning, remove the overwrite '
|
||||
'option from your FEEDS setting or set it to True.')
|
||||
endpoint_url=self.endpoint_url,
|
||||
)
|
||||
if feed_options and feed_options.get("overwrite", True) is False:
|
||||
logger.warning(
|
||||
"S3 does not support appending to files. To "
|
||||
"suppress this warning, remove the overwrite "
|
||||
"option from your FEEDS setting or set it to True."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, uri, *, feed_options=None):
|
||||
return build_storage(
|
||||
cls,
|
||||
uri,
|
||||
access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
|
||||
secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
|
||||
session_token=crawler.settings['AWS_SESSION_TOKEN'],
|
||||
acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
|
||||
endpoint_url=crawler.settings['AWS_ENDPOINT_URL'] or None,
|
||||
access_key=crawler.settings["AWS_ACCESS_KEY_ID"],
|
||||
secret_key=crawler.settings["AWS_SECRET_ACCESS_KEY"],
|
||||
session_token=crawler.settings["AWS_SESSION_TOKEN"],
|
||||
acl=crawler.settings["FEED_STORAGE_S3_ACL"] or None,
|
||||
endpoint_url=crawler.settings["AWS_ENDPOINT_URL"] or None,
|
||||
feed_options=feed_options,
|
||||
)
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
kwargs = {'ACL': self.acl} if self.acl else {}
|
||||
kwargs = {"ACL": self.acl} if self.acl else {}
|
||||
self.s3_client.put_object(
|
||||
Bucket=self.bucketname, Key=self.keyname, Body=file,
|
||||
**kwargs)
|
||||
Bucket=self.bucketname, Key=self.keyname, Body=file, **kwargs
|
||||
)
|
||||
file.close()
|
||||
|
||||
|
||||
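A hedged sketch of wiring the S3 storage above through the FEEDS setting; the bucket and key template are placeholders, and credentials are expected to come from the AWS_* settings read in from_crawler or from the environment:

FEEDS = {
    "s3://my-bucket/%(name)s/%(time)s.json": {
        "format": "json",
        "overwrite": True,   # S3 cannot append, as the warning above notes
    },
}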
class GCSFeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri, project_id, acl):
|
||||
self.project_id = project_id
|
||||
self.acl = acl
|
||||
@ -212,13 +225,14 @@ class GCSFeedStorage(BlockingFeedStorage):
|
||||
def from_crawler(cls, crawler, uri):
|
||||
return cls(
|
||||
uri,
|
||||
crawler.settings['GCS_PROJECT_ID'],
|
||||
crawler.settings['FEED_STORAGE_GCS_ACL'] or None
|
||||
crawler.settings["GCS_PROJECT_ID"],
|
||||
crawler.settings["FEED_STORAGE_GCS_ACL"] or None,
|
||||
)
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
from google.cloud.storage import Client
|
||||
|
||||
client = Client(project=self.project_id)
|
||||
bucket = client.get_bucket(self.bucket_name)
|
||||
blob = bucket.blob(self.blob_name)
|
||||
@ -226,37 +240,51 @@ class GCSFeedStorage(BlockingFeedStorage):
|
||||
|
||||
|
||||
class FTPFeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri, use_active_mode=False, *, feed_options=None):
|
||||
u = urlparse(uri)
|
||||
self.host = u.hostname
|
||||
self.port = int(u.port or '21')
|
||||
self.port = int(u.port or "21")
|
||||
self.username = u.username
|
||||
self.password = unquote(u.password or '')
|
||||
self.password = unquote(u.password or "")
|
||||
self.path = u.path
|
||||
self.use_active_mode = use_active_mode
|
||||
self.overwrite = not feed_options or feed_options.get('overwrite', True)
|
||||
self.overwrite = not feed_options or feed_options.get("overwrite", True)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, uri, *, feed_options=None):
|
||||
return build_storage(
|
||||
cls,
|
||||
uri,
|
||||
crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
|
||||
crawler.settings.getbool("FEED_STORAGE_FTP_ACTIVE"),
|
||||
feed_options=feed_options,
|
||||
)
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
ftp_store_file(
|
||||
path=self.path, file=file, host=self.host,
|
||||
port=self.port, username=self.username,
|
||||
password=self.password, use_active_mode=self.use_active_mode,
|
||||
path=self.path,
|
||||
file=file,
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
use_active_mode=self.use_active_mode,
|
||||
overwrite=self.overwrite,
|
||||
)
|
||||
|
||||
|
||||
class _FeedSlot:
|
||||
def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template, filter):
|
||||
def __init__(
|
||||
self,
|
||||
file,
|
||||
exporter,
|
||||
storage,
|
||||
uri,
|
||||
format,
|
||||
store_empty,
|
||||
batch_id,
|
||||
uri_template,
|
||||
filter,
|
||||
):
|
||||
self.file = file
|
||||
self.exporter = exporter
|
||||
self.storage = storage
|
||||
@ -283,7 +311,6 @@ class _FeedSlot:
|
||||
|
||||
|
||||
class FeedExporter:
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
exporter = cls(crawler)
|
||||
@ -299,48 +326,55 @@ class FeedExporter:
|
||||
self.slots = []
|
||||
self.filters = {}
|
||||
|
||||
if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
|
||||
if not self.settings["FEEDS"] and not self.settings["FEED_URI"]:
|
||||
raise NotConfigured
|
||||
|
||||
# Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
|
||||
if self.settings['FEED_URI']:
|
||||
if self.settings["FEED_URI"]:
|
||||
warnings.warn(
|
||||
'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
|
||||
'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
|
||||
category=ScrapyDeprecationWarning, stacklevel=2,
|
||||
"The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of "
|
||||
"the `FEEDS` setting. Please see the `FEEDS` setting docs for more details",
|
||||
category=ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
uri = str(self.settings["FEED_URI"]) # handle pathlib.Path objects
|
||||
feed_options = {"format": self.settings.get("FEED_FORMAT", "jsonlines")}
|
||||
self.feeds[uri] = feed_complete_default_values_from_settings(
|
||||
feed_options, self.settings
|
||||
)
|
||||
uri = str(self.settings['FEED_URI']) # handle pathlib.Path objects
|
||||
feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
|
||||
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
|
||||
self.filters[uri] = self._load_filter(feed_options)
|
||||
# End: Backward compatibility for FEED_URI and FEED_FORMAT settings
|
||||
|
||||
# 'FEEDS' setting takes precedence over 'FEED_URI'
|
||||
for uri, feed_options in self.settings.getdict('FEEDS').items():
|
||||
for uri, feed_options in self.settings.getdict("FEEDS").items():
|
||||
uri = str(uri) # handle pathlib.Path objects
|
||||
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
|
||||
self.feeds[uri] = feed_complete_default_values_from_settings(
|
||||
feed_options, self.settings
|
||||
)
|
||||
self.filters[uri] = self._load_filter(feed_options)
|
||||
|
||||
self.storages = self._load_components('FEED_STORAGES')
|
||||
self.exporters = self._load_components('FEED_EXPORTERS')
|
||||
self.storages = self._load_components("FEED_STORAGES")
|
||||
self.exporters = self._load_components("FEED_EXPORTERS")
|
||||
for uri, feed_options in self.feeds.items():
|
||||
if not self._storage_supported(uri, feed_options):
|
||||
raise NotConfigured
|
||||
if not self._settings_are_valid():
|
||||
raise NotConfigured
|
||||
if not self._exporter_supported(feed_options['format']):
|
||||
if not self._exporter_supported(feed_options["format"]):
|
||||
raise NotConfigured
|
||||
|
||||
def open_spider(self, spider):
|
||||
for uri, feed_options in self.feeds.items():
|
||||
uri_params = self._get_uri_params(spider, feed_options['uri_params'])
|
||||
self.slots.append(self._start_new_batch(
|
||||
batch_id=1,
|
||||
uri=uri % uri_params,
|
||||
feed_options=feed_options,
|
||||
spider=spider,
|
||||
uri_template=uri,
|
||||
))
|
||||
uri_params = self._get_uri_params(spider, feed_options["uri_params"])
|
||||
self.slots.append(
|
||||
self._start_new_batch(
|
||||
batch_id=1,
|
||||
uri=uri % uri_params,
|
||||
feed_options=feed_options,
|
||||
spider=spider,
|
||||
uri_template=uri,
|
||||
)
|
||||
)
|
||||
|
||||
def close_spider(self, spider):
|
||||
deferred_list = []
|
||||
@ -368,16 +402,15 @@ class FeedExporter:
|
||||
|
||||
def _handle_store_error(self, f, logmsg, spider, slot_type):
|
||||
logger.error(
|
||||
"Error storing %s", logmsg,
|
||||
exc_info=failure_to_exc_info(f), extra={'spider': spider}
|
||||
"Error storing %s",
|
||||
logmsg,
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={"spider": spider},
|
||||
)
|
||||
self.crawler.stats.inc_value(f"feedexport/failed_count/{slot_type}")
|
||||
|
||||
def _handle_store_success(self, f, logmsg, spider, slot_type):
|
||||
logger.info(
|
||||
"Stored %s", logmsg,
|
||||
extra={'spider': spider}
|
||||
)
|
||||
logger.info("Stored %s", logmsg, extra={"spider": spider})
|
||||
self.crawler.stats.inc_value(f"feedexport/success_count/{slot_type}")
|
||||
|
||||
def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
|
||||
@ -393,26 +426,28 @@ class FeedExporter:
|
||||
storage = self._get_storage(uri, feed_options)
|
||||
file = storage.open(spider)
|
||||
if "postprocessing" in feed_options:
|
||||
file = PostProcessingManager(feed_options["postprocessing"], file, feed_options)
|
||||
file = PostProcessingManager(
|
||||
feed_options["postprocessing"], file, feed_options
|
||||
)
|
||||
|
||||
exporter = self._get_exporter(
|
||||
file=file,
|
||||
format=feed_options['format'],
|
||||
fields_to_export=feed_options['fields'],
|
||||
encoding=feed_options['encoding'],
|
||||
indent=feed_options['indent'],
|
||||
**feed_options['item_export_kwargs'],
|
||||
format=feed_options["format"],
|
||||
fields_to_export=feed_options["fields"],
|
||||
encoding=feed_options["encoding"],
|
||||
indent=feed_options["indent"],
|
||||
**feed_options["item_export_kwargs"],
|
||||
)
|
||||
slot = _FeedSlot(
|
||||
file=file,
|
||||
exporter=exporter,
|
||||
storage=storage,
|
||||
uri=uri,
|
||||
format=feed_options['format'],
|
||||
store_empty=feed_options['store_empty'],
|
||||
format=feed_options["format"],
|
||||
store_empty=feed_options["store_empty"],
|
||||
batch_id=batch_id,
|
||||
uri_template=uri_template,
|
||||
filter=self.filters[uri_template]
|
||||
filter=self.filters[uri_template],
|
||||
)
|
||||
if slot.store_empty:
|
||||
slot.start_exporting()
|
||||
@ -422,7 +457,9 @@ class FeedExporter:
|
||||
slots = []
|
||||
for slot in self.slots:
|
||||
if not slot.filter.accepts(item):
|
||||
slots.append(slot) # if slot doesn't accept item, continue with next slot
|
||||
slots.append(
|
||||
slot
|
||||
) # if slot doesn't accept item, continue with next slot
|
||||
continue
|
||||
|
||||
slot.start_exporting()
|
||||
@ -430,18 +467,22 @@ class FeedExporter:
|
||||
slot.itemcount += 1
|
||||
# create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
|
||||
if (
|
||||
self.feeds[slot.uri_template]['batch_item_count']
|
||||
and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
|
||||
self.feeds[slot.uri_template]["batch_item_count"]
|
||||
and slot.itemcount >= self.feeds[slot.uri_template]["batch_item_count"]
|
||||
):
|
||||
uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
|
||||
uri_params = self._get_uri_params(
|
||||
spider, self.feeds[slot.uri_template]["uri_params"], slot
|
||||
)
|
||||
self._close_slot(slot, spider)
|
||||
slots.append(self._start_new_batch(
|
||||
batch_id=slot.batch_id + 1,
|
||||
uri=slot.uri_template % uri_params,
|
||||
feed_options=self.feeds[slot.uri_template],
|
||||
spider=spider,
|
||||
uri_template=slot.uri_template,
|
||||
))
|
||||
slots.append(
|
||||
self._start_new_batch(
|
||||
batch_id=slot.batch_id + 1,
|
||||
uri=slot.uri_template % uri_params,
|
||||
feed_options=self.feeds[slot.uri_template],
|
||||
spider=spider,
|
||||
uri_template=slot.uri_template,
|
||||
)
|
||||
)
|
||||
else:
|
||||
slots.append(slot)
|
||||
self.slots = slots
|
||||
@ -459,7 +500,7 @@ class FeedExporter:
|
||||
def _exporter_supported(self, format):
|
||||
if format in self.exporters:
|
||||
return True
|
||||
logger.error("Unknown feed format: %(format)s", {'format': format})
|
||||
logger.error("Unknown feed format: %(format)s", {"format": format})
|
||||
|
||||
def _settings_are_valid(self):
|
||||
"""
|
||||
@ -467,12 +508,14 @@ class FeedExporter:
|
||||
%(batch_time)s or %(batch_id)d to distinguish different files of partial output
|
||||
"""
|
||||
for uri_template, values in self.feeds.items():
|
||||
if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
|
||||
if values["batch_item_count"] and not re.search(
|
||||
r"%\(batch_time\)s|%\(batch_id\)", uri_template
|
||||
):
|
||||
logger.error(
|
||||
'%%(batch_time)s or %%(batch_id)d must be in the feed URI (%s) if FEED_EXPORT_BATCH_ITEM_COUNT '
|
||||
'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
|
||||
'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count',
|
||||
uri_template
|
||||
"%%(batch_time)s or %%(batch_id)d must be in the feed URI (%s) if FEED_EXPORT_BATCH_ITEM_COUNT "
|
||||
"setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: "
|
||||
"https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count",
|
||||
uri_template,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
@ -484,17 +527,17 @@ class FeedExporter:
|
||||
self._get_storage(uri, feed_options)
|
||||
return True
|
||||
except NotConfigured as e:
|
||||
logger.error("Disabled feed storage scheme: %(scheme)s. "
|
||||
"Reason: %(reason)s",
|
||||
{'scheme': scheme, 'reason': str(e)})
|
||||
logger.error(
|
||||
"Disabled feed storage scheme: %(scheme)s. " "Reason: %(reason)s",
|
||||
{"scheme": scheme, "reason": str(e)},
|
||||
)
|
||||
else:
|
||||
logger.error("Unknown feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
logger.error("Unknown feed storage scheme: %(scheme)s", {"scheme": scheme})
|
||||
|
||||
def _get_instance(self, objcls, *args, **kwargs):
|
||||
return create_instance(
|
||||
objcls, self.settings, getattr(self, 'crawler', None),
|
||||
*args, **kwargs)
|
||||
objcls, self.settings, getattr(self, "crawler", None), *args, **kwargs
|
||||
)
|
||||
|
||||
def _get_exporter(self, file, format, *args, **kwargs):
|
||||
return self._get_instance(self.exporters[format], file, *args, **kwargs)
|
||||
@ -506,20 +549,22 @@ class FeedExporter:
|
||||
do not support it, and issuing a deprecation warning instead.
|
||||
"""
|
||||
feedcls = self.storages[urlparse(uri).scheme]
|
||||
crawler = getattr(self, 'crawler', None)
|
||||
crawler = getattr(self, "crawler", None)
|
||||
|
||||
def build_instance(builder, *preargs):
|
||||
return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
|
||||
return build_storage(
|
||||
builder, uri, feed_options=feed_options, preargs=preargs
|
||||
)
|
||||
|
||||
if crawler and hasattr(feedcls, 'from_crawler'):
|
||||
if crawler and hasattr(feedcls, "from_crawler"):
|
||||
instance = build_instance(feedcls.from_crawler, crawler)
|
||||
method_name = 'from_crawler'
|
||||
elif hasattr(feedcls, 'from_settings'):
|
||||
method_name = "from_crawler"
|
||||
elif hasattr(feedcls, "from_settings"):
|
||||
instance = build_instance(feedcls.from_settings, self.settings)
|
||||
method_name = 'from_settings'
|
||||
method_name = "from_settings"
|
||||
else:
|
||||
instance = build_instance(feedcls)
|
||||
method_name = '__new__'
|
||||
method_name = "__new__"
|
||||
if instance is None:
|
||||
raise TypeError(f"{feedcls.__qualname__}.{method_name} returned None")
|
||||
return instance
|
||||
@ -534,19 +579,23 @@ class FeedExporter:
|
||||
for k in dir(spider):
|
||||
params[k] = getattr(spider, k)
|
||||
utc_now = datetime.utcnow()
|
||||
params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
|
||||
params['batch_time'] = utc_now.isoformat().replace(':', '-')
|
||||
params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
|
||||
params["time"] = utc_now.replace(microsecond=0).isoformat().replace(":", "-")
|
||||
params["batch_time"] = utc_now.isoformat().replace(":", "-")
|
||||
params["batch_id"] = slot.batch_id + 1 if slot is not None else 1
|
||||
original_params = params.copy()
|
||||
uripar_function = load_object(uri_params_function) if uri_params_function else lambda params, _: params
|
||||
uripar_function = (
|
||||
load_object(uri_params_function)
|
||||
if uri_params_function
|
||||
else lambda params, _: params
|
||||
)
|
||||
new_params = uripar_function(params, spider)
|
||||
if new_params is None or original_params != params:
|
||||
warnings.warn(
|
||||
'Modifying the params dictionary in-place in the function defined in '
|
||||
'the FEED_URI_PARAMS setting or in the uri_params key of the FEEDS '
|
||||
'setting is deprecated. The function must return a new dictionary '
|
||||
'instead.',
|
||||
category=ScrapyDeprecationWarning
|
||||
"Modifying the params dictionary in-place in the function defined in "
|
||||
"the FEED_URI_PARAMS setting or in the uri_params key of the FEEDS "
|
||||
"setting is deprecated. The function must return a new dictionary "
|
||||
"instead.",
|
||||
category=ScrapyDeprecationWarning,
|
||||
)
|
||||
return new_params if new_params is not None else params
|
||||
|
||||
|
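As _settings_are_valid above enforces, a batch_item_count (or the FEED_EXPORT_BATCH_ITEM_COUNT setting) requires %(batch_id)d or %(batch_time)s in the feed URI so that every batch gets its own file. An illustrative FEEDS entry with an arbitrary batch size:

FEEDS = {
    "items-%(batch_id)d.json": {
        "format": "json",
        "batch_item_count": 100,
    },
}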
@ -21,10 +21,11 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DummyPolicy:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
|
||||
self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]
|
||||
self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self.ignore_http_codes = [
|
||||
int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
|
||||
]
|
||||
|
||||
def should_cache_request(self, request):
|
||||
return urlparse_cached(request).scheme not in self.ignore_schemes
|
||||
@ -44,16 +45,17 @@ class RFC2616Policy:
|
||||
MAXAGE = 3600 * 24 * 365 # one year
|
||||
|
||||
def __init__(self, settings):
|
||||
self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
|
||||
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
|
||||
self.always_store = settings.getbool("HTTPCACHE_ALWAYS_STORE")
|
||||
self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
|
||||
self._cc_parsed = WeakKeyDictionary()
|
||||
self.ignore_response_cache_controls = [
|
||||
to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
|
||||
to_bytes(cc)
|
||||
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
|
||||
]
|
||||
|
||||
def _parse_cachecontrol(self, r):
|
||||
if r not in self._cc_parsed:
|
||||
cch = r.headers.get(b'Cache-Control', b'')
|
||||
cch = r.headers.get(b"Cache-Control", b"")
|
||||
parsed = parse_cachecontrol(cch)
|
||||
if isinstance(r, Response):
|
||||
for key in self.ignore_response_cache_controls:
|
||||
@ -66,7 +68,7 @@ class RFC2616Policy:
|
||||
return False
|
||||
cc = self._parse_cachecontrol(request)
|
||||
# obey user-agent directive "Cache-Control: no-store"
|
||||
if b'no-store' in cc:
|
||||
if b"no-store" in cc:
|
||||
return False
|
||||
# Any other is eligible for caching
|
||||
return True
|
||||
@ -77,7 +79,7 @@ class RFC2616Policy:
|
||||
# Status code 206 is not included because the cache cannot deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
# obey directive "Cache-Control: no-store"
|
||||
if b'no-store' in cc:
|
||||
if b"no-store" in cc:
|
||||
return False
|
||||
# Never cache 304 (Not Modified) responses
|
||||
if response.status == 304:
|
||||
@ -86,14 +88,14 @@ class RFC2616Policy:
|
||||
if self.always_store:
|
||||
return True
|
||||
# Any hint on response expiration is good
|
||||
if b'max-age' in cc or b'Expires' in response.headers:
|
||||
if b"max-age" in cc or b"Expires" in response.headers:
|
||||
return True
|
||||
# Firefox falls back to a one-year expiration for these statuses if none is set
|
||||
if response.status in (300, 301, 308):
|
||||
return True
|
||||
# Other statuses without expiration info require at least one validator
|
||||
if response.status in (200, 203, 401):
|
||||
return b'Last-Modified' in response.headers or b'ETag' in response.headers
|
||||
return b"Last-Modified" in response.headers or b"ETag" in response.headers
|
||||
# Any other is probably not eligible for caching
|
||||
# It makes no sense to cache responses that do not contain expiration
|
||||
# info and cannot be revalidated
|
||||
@ -102,11 +104,13 @@ class RFC2616Policy:
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
ccreq = self._parse_cachecontrol(request)
|
||||
if b'no-cache' in cc or b'no-cache' in ccreq:
|
||||
if b"no-cache" in cc or b"no-cache" in ccreq:
|
||||
return False
|
||||
|
||||
now = time()
|
||||
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
|
||||
freshnesslifetime = self._compute_freshness_lifetime(
|
||||
cachedresponse, request, now
|
||||
)
|
||||
currentage = self._compute_current_age(cachedresponse, request, now)
|
||||
|
||||
reqmaxage = self._get_max_age(ccreq)
|
||||
@ -116,7 +120,7 @@ class RFC2616Policy:
|
||||
if currentage < freshnesslifetime:
|
||||
return True
|
||||
|
||||
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
|
||||
if b"max-stale" in ccreq and b"must-revalidate" not in cc:
|
||||
# From RFC2616: "Indicates that the client is willing to
|
||||
# accept a response that has exceeded its expiration time.
|
||||
# If max-stale is assigned a value, then the client is
|
||||
@ -124,7 +128,7 @@ class RFC2616Policy:
|
||||
# expiration time by no more than the specified number of
|
||||
# seconds. If no value is assigned to max-stale, then the
|
||||
# client is willing to accept a stale response of any age."
|
||||
staleage = ccreq[b'max-stale']
|
||||
staleage = ccreq[b"max-stale"]
|
||||
if staleage is None:
|
||||
return True
|
||||
|
||||
@ -143,22 +147,24 @@ class RFC2616Policy:
|
||||
# as long as the old response didn't specify must-revalidate.
|
||||
if response.status >= 500:
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
if b'must-revalidate' not in cc:
|
||||
if b"must-revalidate" not in cc:
|
||||
return True
|
||||
|
||||
# Use the cached response if the server says it hasn't changed.
|
||||
return response.status == 304
|
||||
|
||||
def _set_conditional_validators(self, request, cachedresponse):
|
||||
if b'Last-Modified' in cachedresponse.headers:
|
||||
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
|
||||
if b"Last-Modified" in cachedresponse.headers:
|
||||
request.headers[b"If-Modified-Since"] = cachedresponse.headers[
|
||||
b"Last-Modified"
|
||||
]
|
||||
|
||||
if b'ETag' in cachedresponse.headers:
|
||||
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
|
||||
if b"ETag" in cachedresponse.headers:
|
||||
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
|
||||
|
||||
def _get_max_age(self, cc):
|
||||
try:
|
||||
return max(0, int(cc[b'max-age']))
|
||||
return max(0, int(cc[b"max-age"]))
|
||||
except (KeyError, ValueError):
|
||||
return None
|
||||
|
||||
@ -171,18 +177,18 @@ class RFC2616Policy:
|
||||
return maxage
|
||||
|
||||
# Parse date header or synthesize it if none exists
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
|
||||
|
||||
# Try HTTP/1.0 Expires header
|
||||
if b'Expires' in response.headers:
|
||||
expires = rfc1123_to_epoch(response.headers[b'Expires'])
|
||||
if b"Expires" in response.headers:
|
||||
expires = rfc1123_to_epoch(response.headers[b"Expires"])
|
||||
# When parsing the Expires header fails, RFC 2616 section 14.21 says we
|
||||
# should treat this as an expiration time in the past.
|
||||
return max(0, expires - date) if expires else 0
|
||||
|
||||
# Fallback to heuristic using last-modified header
|
||||
# This is not in the RFC but mirrors Firefox's caching implementation
|
||||
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
|
||||
lastmodified = rfc1123_to_epoch(response.headers.get(b"Last-Modified"))
|
||||
if lastmodified and lastmodified <= date:
|
||||
return (date - lastmodified) / 10
|
||||
|
||||
@ -199,13 +205,13 @@ class RFC2616Policy:
|
||||
currentage = 0
|
||||
# If Date header is not set we assume it is a fast connection, and
|
||||
# clock is in sync with the server
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
|
||||
if now > date:
|
||||
currentage = now - date
|
||||
|
||||
if b'Age' in response.headers:
|
||||
if b"Age" in response.headers:
|
||||
try:
|
||||
age = int(response.headers[b'Age'])
|
||||
age = int(response.headers[b"Age"])
|
||||
currentage = max(currentage, age)
|
||||
except ValueError:
|
||||
pass
|
||||
@ -214,18 +220,21 @@ class RFC2616Policy:
|
||||
|
||||
|
||||
class DbmCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
|
||||
self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
|
||||
self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
|
||||
self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
|
||||
self.db = None
|
||||
|
||||
def open_spider(self, spider: Spider):
|
||||
dbpath = Path(self.cachedir, f'{spider.name}.db')
|
||||
self.db = self.dbmodule.open(str(dbpath), 'c')
|
||||
dbpath = Path(self.cachedir, f"{spider.name}.db")
|
||||
self.db = self.dbmodule.open(str(dbpath), "c")
|
||||
|
||||
logger.debug("Using DBM cache storage in %(cachepath)s", {'cachepath': dbpath}, extra={'spider': spider})
|
||||
logger.debug(
|
||||
"Using DBM cache storage in %(cachepath)s",
|
||||
{"cachepath": dbpath},
|
||||
extra={"spider": spider},
|
||||
)
|
||||
|
||||
self._fingerprinter = spider.crawler.request_fingerprinter
|
||||
|
||||
@ -236,10 +245,10 @@ class DbmCacheStorage:
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return # not cached
|
||||
url = data['url']
|
||||
status = data['status']
|
||||
headers = Headers(data['headers'])
|
||||
body = data['body']
|
||||
url = data["url"]
|
||||
status = data["status"]
|
||||
headers = Headers(data["headers"])
|
||||
body = data["body"]
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
@ -247,18 +256,18 @@ class DbmCacheStorage:
|
||||
def store_response(self, spider, request, response):
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
data = {
|
||||
'status': response.status,
|
||||
'url': response.url,
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
"status": response.status,
|
||||
"url": response.url,
|
||||
"headers": dict(response.headers),
|
||||
"body": response.body,
|
||||
}
|
||||
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
|
||||
self.db[f'{key}_time'] = str(time())
|
||||
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
|
||||
self.db[f"{key}_time"] = str(time())
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
key = self._fingerprinter.fingerprint(request).hex()
|
||||
db = self.db
|
||||
tkey = f'{key}_time'
|
||||
tkey = f"{key}_time"
|
||||
if tkey not in db:
|
||||
return # not found
|
||||
|
||||
@ -266,20 +275,22 @@ class DbmCacheStorage:
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return # expired
|
||||
|
||||
return pickle.loads(db[f'{key}_data'])
|
||||
return pickle.loads(db[f"{key}_data"])
|
||||
|
||||
|
||||
class FilesystemCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
|
||||
self.cachedir = data_path(settings["HTTPCACHE_DIR"])
|
||||
self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
|
||||
self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
|
||||
self._open = gzip.open if self.use_gzip else open
|
||||
|
||||
def open_spider(self, spider: Spider):
|
||||
logger.debug("Using filesystem cache storage in %(cachedir)s", {'cachedir': self.cachedir},
|
||||
extra={'spider': spider})
|
||||
logger.debug(
|
||||
"Using filesystem cache storage in %(cachedir)s",
|
||||
{"cachedir": self.cachedir},
|
||||
extra={"spider": spider},
|
||||
)
|
||||
|
||||
self._fingerprinter = spider.crawler.request_fingerprinter
|
||||
|
||||
@ -292,12 +303,12 @@ class FilesystemCacheStorage:
|
||||
if metadata is None:
|
||||
return # not cached
|
||||
rpath = Path(self._get_request_path(spider, request))
|
||||
with self._open(rpath / 'response_body', 'rb') as f:
|
||||
with self._open(rpath / "response_body", "rb") as f:
|
||||
body = f.read()
|
||||
with self._open(rpath / 'response_headers', 'rb') as f:
|
||||
with self._open(rpath / "response_headers", "rb") as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata.get('response_url')
|
||||
status = metadata['status']
|
||||
url = metadata.get("response_url")
|
||||
status = metadata["status"]
|
||||
headers = Headers(headers_raw_to_dict(rawheaders))
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
@ -309,23 +320,23 @@ class FilesystemCacheStorage:
|
||||
if not rpath.exists():
|
||||
rpath.mkdir(parents=True)
|
||||
metadata = {
|
||||
'url': request.url,
|
||||
'method': request.method,
|
||||
'status': response.status,
|
||||
'response_url': response.url,
|
||||
'timestamp': time(),
|
||||
"url": request.url,
|
||||
"method": request.method,
|
||||
"status": response.status,
|
||||
"response_url": response.url,
|
||||
"timestamp": time(),
|
||||
}
|
||||
with self._open(rpath / 'meta', 'wb') as f:
|
||||
with self._open(rpath / "meta", "wb") as f:
|
||||
f.write(to_bytes(repr(metadata)))
|
||||
with self._open(rpath / 'pickled_meta', 'wb') as f:
|
||||
with self._open(rpath / "pickled_meta", "wb") as f:
|
||||
pickle.dump(metadata, f, protocol=4)
|
||||
with self._open(rpath / 'response_headers', 'wb') as f:
|
||||
with self._open(rpath / "response_headers", "wb") as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with self._open(rpath / 'response_body', 'wb') as f:
|
||||
with self._open(rpath / "response_body", "wb") as f:
|
||||
f.write(response.body)
|
||||
with self._open(rpath / 'request_headers', 'wb') as f:
|
||||
with self._open(rpath / "request_headers", "wb") as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
with self._open(rpath / 'request_body', 'wb') as f:
|
||||
with self._open(rpath / "request_body", "wb") as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider: Spider, request: Request) -> str:
|
||||
@ -334,13 +345,13 @@ class FilesystemCacheStorage:
|
||||
|
||||
def _read_meta(self, spider: Spider, request: Request):
|
||||
rpath = Path(self._get_request_path(spider, request))
|
||||
metapath = rpath / 'pickled_meta'
|
||||
metapath = rpath / "pickled_meta"
|
||||
if not metapath.exists():
|
||||
return # not found
|
||||
mtime = metapath.stat().st_mtime
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with self._open(metapath, 'rb') as f:
|
||||
with self._open(metapath, "rb") as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
@ -357,8 +368,8 @@ def parse_cachecontrol(header):
|
||||
|
||||
"""
|
||||
directives = {}
|
||||
for directive in header.split(b','):
|
||||
key, sep, val = directive.strip().partition(b'=')
|
||||
for directive in header.split(b","):
|
||||
key, sep, val = directive.strip().partition(b"=")
|
||||
if key:
|
||||
directives[key.lower()] = val if sep else None
|
||||
return directives
|
||||
@ -366,7 +377,7 @@ def parse_cachecontrol(header):
|
||||
|
||||
def rfc1123_to_epoch(date_str):
|
||||
try:
|
||||
date_str = to_unicode(date_str, encoding='ascii')
|
||||
date_str = to_unicode(date_str, encoding="ascii")
|
||||
return mktime_tz(parsedate_tz(date_str))
|
||||
except Exception:
|
||||
return None
|
||||
|
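The cache policy and storage classes above are selected and tuned through the HTTPCACHE_* settings read in their __init__ methods. An illustrative combination (values are arbitrary examples, not recommendations):

HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
HTTPCACHE_EXPIRATION_SECS = 24 * 3600   # 0 means cached responses never expire
HTTPCACHE_GZIP = True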
@ -19,7 +19,7 @@ class LogStats:

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
        interval = crawler.settings.getfloat("LOGSTATS_INTERVAL")
        if not interval:
            raise NotConfigured
        o = cls(crawler.stats, interval)
@ -35,17 +35,23 @@ class LogStats:
        self.task.start(self.interval)

    def log(self, spider):
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        items = self.stats.get_value("item_scraped_count", 0)
        pages = self.stats.get_value("response_received_count", 0)
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
        self.pagesprev, self.itemsprev = pages, items

        msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
               "scraped %(items)d items (at %(itemrate)d items/min)")
        log_args = {'pages': pages, 'pagerate': prate,
                    'items': items, 'itemrate': irate}
        logger.info(msg, log_args, extra={'spider': spider})
        msg = (
            "Crawled %(pages)d pages (at %(pagerate)d pages/min), "
            "scraped %(items)d items (at %(itemrate)d items/min)"
        )
        log_args = {
            "pages": pages,
            "pagerate": prate,
            "items": items,
            "itemrate": irate,
        }
        logger.info(msg, log_args, extra={"spider": spider})

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:

@ -12,13 +12,12 @@ from scrapy.utils.trackref import live_refs


class MemoryDebugger:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
        if not crawler.settings.getbool("MEMDEBUG_ENABLED"):
            raise NotConfigured
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
@ -26,8 +25,12 @@ class MemoryDebugger:

    def spider_closed(self, spider, reason):
        gc.collect()
        self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
        self.stats.set_value(
            "memdebug/gc_garbage_count", len(gc.garbage), spider=spider
        )
        for cls, wdict in live_refs.items():
            if not wdict:
                continue
            self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)
            self.stats.set_value(
                f"memdebug/live_refs/{cls.__name__}", len(wdict), spider=spider
            )

@ -20,22 +20,23 @@ logger = logging.getLogger(__name__)


class MemoryUsage:

    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
        if not crawler.settings.getbool("MEMUSAGE_ENABLED"):
            raise NotConfigured
        try:
            # stdlib's resource module is only available on unix platforms.
            self.resource = import_module('resource')
            self.resource = import_module("resource")
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
        self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
        self.notify_mails = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
        self.limit = crawler.settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
        self.warning = crawler.settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
        self.check_interval = crawler.settings.getfloat(
            "MEMUSAGE_CHECK_INTERVAL_SECONDS"
        )
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
@ -46,13 +47,13 @@ class MemoryUsage:

    def get_virtual_size(self):
        size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
        if sys.platform != 'darwin':
        if sys.platform != "darwin":
            # on macOS ru_maxrss is in bytes, on Linux it is in KB
            size *= 1024
        return size

    def engine_started(self):
        self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
        self.crawler.stats.set_value("memusage/startup", self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
@ -72,45 +73,56 @@ class MemoryUsage:
                tsk.stop()

    def update(self):
        self.crawler.stats.max_value('memusage/max', self.get_virtual_size())
        self.crawler.stats.max_value("memusage/max", self.get_virtual_size())

    def _check_limit(self):
        peak_mem_usage = self.get_virtual_size()
        if peak_mem_usage > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            self.crawler.stats.set_value("memusage/limit_reached", 1)
            mem = self.limit / 1024 / 1024
            logger.error("Memory usage exceeded %(memusage)dMiB. Shutting down Scrapy...",
                         {'memusage': mem}, extra={'crawler': self.crawler})
            logger.error(
                "Memory usage exceeded %(memusage)dMiB. Shutting down Scrapy...",
                {"memusage": mem},
                extra={"crawler": self.crawler},
            )
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} terminated: "
                    f"memory usage exceeded {mem}MiB at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)
                self.crawler.stats.set_value("memusage/limit_notified", 1)

            if self.crawler.engine.spider is not None:
                self.crawler.engine.close_spider(self.crawler.engine.spider, 'memusage_exceeded')
                self.crawler.engine.close_spider(
                    self.crawler.engine.spider, "memusage_exceeded"
                )
            else:
                self.crawler.stop()
        else:
            logger.info("Peak memory usage is %(virtualsize)dMiB", {'virtualsize': peak_mem_usage / 1024 / 1024})
            logger.info(
                "Peak memory usage is %(virtualsize)dMiB",
                {"virtualsize": peak_mem_usage / 1024 / 1024},
            )

    def _check_warning(self):
        if self.warned:  # warn only once
            return
        if self.get_virtual_size() > self.warning:
            self.crawler.stats.set_value('memusage/warning_reached', 1)
            self.crawler.stats.set_value("memusage/warning_reached", 1)
            mem = self.warning / 1024 / 1024
            logger.warning("Memory usage reached %(memusage)dMiB",
                           {'memusage': mem}, extra={'crawler': self.crawler})
            logger.warning(
                "Memory usage reached %(memusage)dMiB",
                {"memusage": mem},
                extra={"crawler": self.crawler},
            )
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} warning: "
                    f"memory usage reached {mem}MiB at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/warning_notified', 1)
                self.crawler.stats.set_value("memusage/warning_notified", 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
@ -120,7 +132,9 @@ class MemoryUsage:
        s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
        s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += (
            "ENGINE STATUS ------------------------------------------------------- \r\n"
        )
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"

@ -29,8 +29,13 @@ class GzipPlugin:
        compress_level = self.feed_options.get("gzip_compresslevel", 9)
        mtime = self.feed_options.get("gzip_mtime")
        filename = self.feed_options.get("gzip_filename")
        self.gzipfile = GzipFile(fileobj=self.file, mode="wb", compresslevel=compress_level,
                                 mtime=mtime, filename=filename)
        self.gzipfile = GzipFile(
            fileobj=self.file,
            mode="wb",
            compresslevel=compress_level,
            mtime=mtime,
            filename=filename,
        )

    def write(self, data: bytes) -> int:
        return self.gzipfile.write(data)
@ -55,7 +60,9 @@ class Bz2Plugin:
        self.file = file
        self.feed_options = feed_options
        compress_level = self.feed_options.get("bz2_compresslevel", 9)
        self.bz2file = BZ2File(filename=self.file, mode="wb", compresslevel=compress_level)
        self.bz2file = BZ2File(
            filename=self.file, mode="wb", compresslevel=compress_level
        )

    def write(self, data: bytes) -> int:
        return self.bz2file.write(data)
@ -90,8 +97,14 @@ class LZMAPlugin:
        check = self.feed_options.get("lzma_check", -1)
        preset = self.feed_options.get("lzma_preset")
        filters = self.feed_options.get("lzma_filters")
        self.lzmafile = LZMAFile(filename=self.file, mode="wb", format=format,
                                 check=check, preset=preset, filters=filters)
        self.lzmafile = LZMAFile(
            filename=self.file,
            mode="wb",
            format=format,
            check=check,
            preset=preset,
            filters=filters,
        )

    def write(self, data: bytes) -> int:
        return self.lzmafile.write(data)
@ -114,7 +127,9 @@ class PostProcessingManager(IOBase):
    :type file: file like object
    """

    def __init__(self, plugins: List[Any], file: BinaryIO, feed_options: Dict[str, Any]) -> None:
    def __init__(
        self, plugins: List[Any], file: BinaryIO, feed_options: Dict[str, Any]
    ) -> None:
        self.plugins = self._load_plugins(plugins)
        self.file = file
        self.feed_options = feed_options

@ -25,16 +25,16 @@ class SpiderState:

    def spider_closed(self, spider):
        if self.jobdir:
            with Path(self.statefn).open('wb') as f:
            with Path(self.statefn).open("wb") as f:
                pickle.dump(spider.state, f, protocol=4)

    def spider_opened(self, spider):
        if self.jobdir and Path(self.statefn).exists():
            with Path(self.statefn).open('rb') as f:
            with Path(self.statefn).open("rb") as f:
                spider.state = pickle.load(f)
        else:
            spider.state = {}

    @property
    def statefn(self) -> str:
        return str(Path(self.jobdir, 'spider.state'))
        return str(Path(self.jobdir, "spider.state"))

@ -10,7 +10,6 @@ from scrapy.exceptions import NotConfigured


class StatsMailer:

    def __init__(self, stats, recipients, mail):
        self.stats = stats
        self.recipients = recipients

@ -15,6 +15,7 @@ from twisted.internet import protocol
try:
    from twisted.conch import manhole, telnet
    from twisted.conch.insults import insults

    TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
    _TWISTED_CONCH_TRACEBACK = traceback.format_exc()
@ -35,24 +36,26 @@ update_telnet_vars = object()


class TelnetConsole(protocol.ServerFactory):

    def __init__(self, crawler):
        if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
        if not crawler.settings.getbool("TELNETCONSOLE_ENABLED"):
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                'TELNETCONSOLE_ENABLED setting is True but required twisted '
                'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
                "TELNETCONSOLE_ENABLED setting is True but required twisted "
                "modules failed to import:\n" + _TWISTED_CONCH_TRACEBACK
            )
        self.crawler = crawler
        self.noisy = False
        self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
        self.host = crawler.settings['TELNETCONSOLE_HOST']
        self.username = crawler.settings['TELNETCONSOLE_USERNAME']
        self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
        self.portrange = [
            int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
        ]
        self.host = crawler.settings["TELNETCONSOLE_HOST"]
        self.username = crawler.settings["TELNETCONSOLE_USERNAME"]
        self.password = crawler.settings["TELNETCONSOLE_PASSWORD"]

        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
            logger.info('Telnet Password: %s', self.password)
            self.password = binascii.hexlify(os.urandom(8)).decode("utf8")
            logger.info("Telnet Password: %s", self.password)

        self.crawler.signals.connect(self.start_listening, signals.engine_started)
        self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
@ -64,9 +67,11 @@ class TelnetConsole(protocol.ServerFactory):
    def start_listening(self):
        self.port = listen_tcp(self.portrange, self.host, self)
        h = self.port.getHost()
        logger.info("Telnet console listening on %(host)s:%(port)d",
                    {'host': h.host, 'port': h.port},
                    extra={'crawler': self.crawler})
        logger.info(
            "Telnet console listening on %(host)s:%(port)d",
            {"host": h.host, "port": h.port},
            extra={"crawler": self.crawler},
        )

    def stop_listening(self):
        self.port.stopListening()
@ -74,41 +79,37 @@ class TelnetConsole(protocol.ServerFactory):
    def protocol(self):
        class Portal:
            """An implementation of IPortal"""

            @defers
            def login(self_, credentials, mind, *interfaces):
                if not (
                    credentials.username == self.username.encode('utf8')
                    and credentials.checkPassword(self.password.encode('utf8'))
                    credentials.username == self.username.encode("utf8")
                    and credentials.checkPassword(self.password.encode("utf8"))
                ):
                    raise ValueError("Invalid credentials")

                protocol = telnet.TelnetBootstrapProtocol(
                    insults.ServerProtocol,
                    manhole.Manhole,
                    self._get_telnet_vars()
                    insults.ServerProtocol, manhole.Manhole, self._get_telnet_vars()
                )
                return (interfaces[0], protocol, lambda: None)

        return telnet.TelnetTransport(
            telnet.AuthenticatingTelnetProtocol,
            Portal()
        )
        return telnet.TelnetTransport(telnet.AuthenticatingTelnetProtocol, Portal())

    def _get_telnet_vars(self):
        # Note: if you add entries here also update topics/telnetconsole.rst
        telnet_vars = {
            'engine': self.crawler.engine,
            'spider': self.crawler.engine.spider,
            'slot': self.crawler.engine.slot,
            'crawler': self.crawler,
            'extensions': self.crawler.extensions,
            'stats': self.crawler.stats,
            'settings': self.crawler.settings,
            'est': lambda: print_engine_status(self.crawler.engine),
            'p': pprint.pprint,
            'prefs': print_live_refs,
            'help': "This is Scrapy telnet console. For more info see: "
                    "https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
            "engine": self.crawler.engine,
            "spider": self.crawler.engine.spider,
            "slot": self.crawler.engine.slot,
            "crawler": self.crawler,
            "extensions": self.crawler.extensions,
            "stats": self.crawler.stats,
            "settings": self.crawler.settings,
            "est": lambda: print_engine_status(self.crawler.engine),
            "p": pprint.pprint,
            "prefs": print_live_refs,
            "help": "This is Scrapy telnet console. For more info see: "
            "https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
        }
        self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
        return telnet_vars

@ -7,16 +7,19 @@ logger = logging.getLogger(__name__)


class AutoThrottle:

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
        if not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        self.target_concurrency = crawler.settings.getfloat(
            "AUTOTHROTTLE_TARGET_CONCURRENCY"
        )
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
        crawler.signals.connect(
            self._response_downloaded, signal=signals.response_downloaded
        )

    @classmethod
    def from_crawler(cls, crawler):
@ -29,17 +32,19 @@ class AutoThrottle:

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
        return getattr(spider, "download_delay", s.getfloat("DOWNLOAD_DELAY"))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
        return self.crawler.settings.getfloat("AUTOTHROTTLE_MAX_DELAY")

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
        return max(
            self.mindelay, self.crawler.settings.getfloat("AUTOTHROTTLE_START_DELAY")
        )

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        latency = request.meta.get("download_latency")
        if latency is None or slot is None:
            return

@ -54,15 +59,18 @@ class AutoThrottle:
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size,
                    "slot": key,
                    "concurrency": conc,
                    "delay": slot.delay * 1000,
                    "delaydiff": diff * 1000,
                    "latency": latency * 1000,
                    "size": size,
                },
                extra={'spider': spider}
                extra={"spider": spider},
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        key = request.meta.get("download_slot")
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):

@ -3,4 +3,5 @@ def obsolete_setter(setter, attrname):
            c = self.__class__.__name__
            msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
            raise AttributeError(msg)

        return newsetter

@ -36,7 +36,7 @@ class CookieJar:

        if not IPV4_RE.search(req_host):
            hosts = potential_domain_matches(req_host)
            if '.' not in req_host:
            if "." not in req_host:
                hosts += [req_host + ".local"]
        else:
            hosts = [req_host]
@ -96,14 +96,14 @@ def potential_domain_matches(domain):
    """
    matches = [domain]
    try:
        start = domain.index('.') + 1
        end = domain.rindex('.')
        start = domain.index(".") + 1
        end = domain.rindex(".")
        while start < end:
            matches.append(domain[start:])
            start = domain.index('.', start) + 1
            start = domain.index(".", start) + 1
    except ValueError:
        pass
    return matches + ['.' + d for d in matches]
    return matches + ["." + d for d in matches]


class _DummyLock:
@ -140,7 +140,7 @@ class WrappedRequest:
        HTML document, and the user had no option to approve the automatic
        fetching of the image, this should be true.
        """
        return self.request.meta.get('is_unverifiable', False)
        return self.request.meta.get("is_unverifiable", False)

    @property
    def full_url(self):
@ -166,13 +166,14 @@ class WrappedRequest:
        return name in self.request.headers

    def get_header(self, name, default=None):
        return to_unicode(self.request.headers.get(name, default),
                          errors='replace')
        return to_unicode(self.request.headers.get(name, default), errors="replace")

    def header_items(self):
        return [
            (to_unicode(k, errors='replace'),
             [to_unicode(x, errors='replace') for x in v])
            (
                to_unicode(k, errors="replace"),
                [to_unicode(x, errors="replace") for x in v],
            )
            for k, v in self.request.headers.items()
        ]

@ -181,7 +182,6 @@ class WrappedRequest:


class WrappedResponse:

    def __init__(self, response):
        self.response = response

@ -189,5 +189,6 @@ class WrappedResponse:
        return self

    def get_all(self, name, default=None):
        return [to_unicode(v, errors='replace')
                for v in self.response.headers.getlist(name)]
        return [
            to_unicode(v, errors="replace") for v in self.response.headers.getlist(name)
        ]

@ -8,7 +8,7 @@ from scrapy.utils.python import to_unicode
class Headers(CaselessDict):
    """Case insensitive http headers dictionary"""

    def __init__(self, seq=None, encoding='utf-8'):
    def __init__(self, seq=None, encoding="utf-8"):
        self.encoding = encoding
        super().__init__(seq)

@ -29,7 +29,7 @@ class Headers(CaselessDict):
            value = []
        elif isinstance(value, (str, bytes)):
            value = [value]
        elif not hasattr(value, '__iter__'):
        elif not hasattr(value, "__iter__"):
            value = [value]

        return [self._tobytes(x) for x in value]
@ -41,7 +41,7 @@ class Headers(CaselessDict):
            return x.encode(self.encoding)
        if isinstance(x, int):
            return str(x).encode(self.encoding)
        raise TypeError(f'Unsupported value type: {type(x)}')
        raise TypeError(f"Unsupported value type: {type(x)}")

    def __getitem__(self, key):
        try:
@ -84,13 +84,16 @@ class Headers(CaselessDict):
        return headers_dict_to_raw(self)

    def to_unicode_dict(self):
        """ Return headers as a CaselessDict with unicode keys
        """Return headers as a CaselessDict with unicode keys
        and unicode values. Multiple values are joined with ','.
        """
        return CaselessDict(
            (to_unicode(key, encoding=self.encoding),
             to_unicode(b','.join(value), encoding=self.encoding))
            for key, value in self.items())
            (
                to_unicode(key, encoding=self.encoding),
                to_unicode(b",".join(value), encoding=self.encoding),
            )
            for key, value in self.items()
        )

    def __copy__(self):
        return self.__class__(self)

@ -27,9 +27,19 @@ class Request(object_ref):
    """

    attributes: Tuple[str, ...] = (
        "url", "callback", "method", "headers", "body",
        "cookies", "meta", "encoding", "priority",
        "dont_filter", "errback", "flags", "cb_kwargs",
        "url",
        "callback",
        "method",
        "headers",
        "body",
        "cookies",
        "meta",
        "encoding",
        "priority",
        "dont_filter",
        "errback",
        "flags",
        "cb_kwargs",
    )
    """A tuple of :class:`str` objects containing the name of all public
    attributes of the class that are also keyword parameters of the
@ -64,9 +74,11 @@ class Request(object_ref):
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
            raise TypeError(
                f"callback must be a callable, got {type(callback).__name__}"
            )
        if errback is not None and not callable(errback):
            raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
            raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
        self.callback = callback
        self.errback = errback

@ -101,13 +113,13 @@ class Request(object_ref):
            self._url = escape_ajax(s)

        if (
            '://' not in self._url
            and not self._url.startswith('about:')
            and not self._url.startswith('data:')
            "://" not in self._url
            and not self._url.startswith("about:")
            and not self._url.startswith("data:")
        ):
            raise ValueError(f'Missing scheme in request url: {self._url}')
            raise ValueError(f"Missing scheme in request url: {self._url}")

    url = property(_get_url, obsolete_setter(_set_url, 'url'))
    url = property(_get_url, obsolete_setter(_set_url, "url"))

    def _get_body(self) -> bytes:
        return self._body
@ -115,7 +127,7 @@ class Request(object_ref):
    def _set_body(self, body: Optional[Union[str, bytes]]) -> None:
        self._body = b"" if body is None else to_bytes(body, self.encoding)

    body = property(_get_body, obsolete_setter(_set_body, 'body'))
    body = property(_get_body, obsolete_setter(_set_body, "body"))

    @property
    def encoding(self) -> str:
@ -131,12 +143,15 @@ class Request(object_ref):
        """Create a new Request with the same attributes except for those given new values"""
        for x in self.attributes:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        cls = kwargs.pop("cls", self.__class__)
        return cls(*args, **kwargs)

    @classmethod
    def from_curl(
        cls: Type[RequestTypeVar], curl_command: str, ignore_unknown_options: bool = True, **kwargs
        cls: Type[RequestTypeVar],
        curl_command: str,
        ignore_unknown_options: bool = True,
        **kwargs,
    ) -> RequestTypeVar:
        """Create a Request object from a string containing a `cURL
        <https://curl.haxx.se/>`_ command. It populates the HTTP method, the
@ -179,21 +194,25 @@ class Request(object_ref):
        """
        d = {
            "url": self.url,  # urls are safe (safe_string_url)
            "callback": _find_method(spider, self.callback) if callable(self.callback) else self.callback,
            "errback": _find_method(spider, self.errback) if callable(self.errback) else self.errback,
            "callback": _find_method(spider, self.callback)
            if callable(self.callback)
            else self.callback,
            "errback": _find_method(spider, self.errback)
            if callable(self.errback)
            else self.errback,
            "headers": dict(self.headers),
        }
        for attr in self.attributes:
            d.setdefault(attr, getattr(self, attr))
        if type(self) is not Request:  # pylint: disable=unidiomatic-typecheck
            d["_class"] = self.__module__ + '.' + self.__class__.__name__
            d["_class"] = self.__module__ + "." + self.__class__.__name__
        return d


def _find_method(obj, func):
    """Helper function for Request.to_dict"""
    # Only instance methods contain ``__func__``
    if obj and hasattr(func, '__func__'):
    if obj and hasattr(func, "__func__"):
        members = inspect.getmembers(obj, predicate=inspect.ismethod)
        for name, obj_func in members:
            # We need to use __func__ to access the original function object because instance

@ -24,22 +24,26 @@ FormdataType = Optional[Union[dict, List[Tuple[str, str]]]]


class FormRequest(Request):
    valid_form_methods = ['GET', 'POST']
    valid_form_methods = ["GET", "POST"]

    def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None:
        if formdata and kwargs.get('method') is None:
            kwargs['method'] = 'POST'
        if formdata and kwargs.get("method") is None:
            kwargs["method"] = "POST"

        super().__init__(*args, **kwargs)

        if formdata:
            items = formdata.items() if isinstance(formdata, dict) else formdata
            form_query_str = _urlencode(items, self.encoding)
            if self.method == 'POST':
                self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
            if self.method == "POST":
                self.headers.setdefault(
                    b"Content-Type", b"application/x-www-form-urlencoded"
                )
                self._set_body(form_query_str)
            else:
                self._set_url(urlunsplit(urlsplit(self.url)._replace(query=form_query_str)))
                self._set_url(
                    urlunsplit(urlsplit(self.url)._replace(query=form_query_str))
                )

    @classmethod
    def from_response(
@ -55,28 +59,29 @@ class FormRequest(Request):
        formcss: Optional[str] = None,
        **kwargs,
    ) -> FormRequestTypeVar:
        kwargs.setdefault('encoding', response.encoding)
        kwargs.setdefault("encoding", response.encoding)

        if formcss is not None:
            from parsel.csstranslator import HTMLTranslator

            formxpath = HTMLTranslator().css_to_xpath(formcss)

        form = _get_form(response, formname, formid, formnumber, formxpath)
        formdata = _get_inputs(form, formdata, dont_click, clickdata)
        url = _get_form_url(form, kwargs.pop('url', None))
        url = _get_form_url(form, kwargs.pop("url", None))

        method = kwargs.pop('method', form.method)
        method = kwargs.pop("method", form.method)
        if method is not None:
            method = method.upper()
            if method not in cls.valid_form_methods:
                method = 'GET'
                method = "GET"

        return cls(url=url, method=method, formdata=formdata, **kwargs)


def _get_form_url(form: FormElement, url: Optional[str]) -> str:
    if url is None:
        action = form.get('action')
        action = form.get("action")
        if action is None:
            return form.base_url
        return urljoin(form.base_url, strip_html5_whitespace(action))
@ -84,9 +89,11 @@ def _get_form_url(form: FormElement, url: Optional[str]) -> str:


def _urlencode(seq: Iterable, enc: str) -> str:
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    values = [
        (to_bytes(k, enc), to_bytes(v, enc))
        for k, vs in seq
        for v in (vs if is_listlike(vs) else [vs])
    ]
    return urlencode(values, doseq=True)


@ -99,7 +106,7 @@ def _get_form(
) -> FormElement:
    """Find the wanted form element within the given response."""
    root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
    forms = root.xpath('//form')
    forms = root.xpath("//form")
    if not forms:
        raise ValueError(f"No <form> element found in {response}")

@ -119,12 +126,12 @@ def _get_form(
        if nodes:
            el = nodes[0]
            while True:
                if el.tag == 'form':
                if el.tag == "form":
                    return el
                el = el.getparent()
                if el is None:
                    break
            raise ValueError(f'No <form> element found with {formxpath}')
            raise ValueError(f"No <form> element found with {formxpath}")

    # If we get here, it means that either formname was None or invalid
    if formnumber is not None:
@ -146,19 +153,21 @@ def _get_inputs(
    try:
        formdata_keys = dict(formdata or ()).keys()
    except (ValueError, TypeError):
        raise ValueError('formdata should be a dict or iterable of tuples')
        raise ValueError("formdata should be a dict or iterable of tuples")

    if not formdata:
        formdata = []
    inputs = form.xpath('descendant::textarea'
                        '|descendant::select'
                        '|descendant::input[not(@type) or @type['
                        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
                        ' and (../@checked or'
                        ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
                        namespaces={"re": "http://exslt.org/regular-expressions"})
    inputs = form.xpath(
        "descendant::textarea"
        "|descendant::select"
        "|descendant::input[not(@type) or @type["
        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
        " and (../@checked or"
        ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
        namespaces={"re": "http://exslt.org/regular-expressions"},
    )
    values = [
        (k, '' if v is None else v)
        (k, "" if v is None else v)
        for k, v in (_value(e) for e in inputs)
        if k and k not in formdata_keys
    ]
@ -178,7 +187,7 @@ def _get_inputs(
def _value(ele: HtmlElement):
    n = ele.name
    v = ele.value
    if ele.tag == 'select':
    if ele.tag == "select":
        return _select_value(ele, n, v)
    return n, v

@ -193,51 +202,57 @@ def _select_value(ele: SelectElement, n: str, v: str):
    if v is not None and multiple:
        # This is a workround to bug in lxml fixed 2.3.1
        # fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
        selected_options = ele.xpath('.//option[@selected]')
        values = [(o.get('value') or o.text or '').strip() for o in selected_options]
        selected_options = ele.xpath(".//option[@selected]")
        values = [(o.get("value") or o.text or "").strip() for o in selected_options]
        return n, values
    return n, v


def _get_clickable(clickdata: Optional[dict], form: FormElement) -> Optional[Tuple[str, str]]:
def _get_clickable(
    clickdata: Optional[dict], form: FormElement
) -> Optional[Tuple[str, str]]:
    """
    Returns the clickable element specified in clickdata,
    if the latter is given. If not, it returns the first
    clickable element found
    """
    clickables = list(form.xpath(
        'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
        '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
        namespaces={"re": "http://exslt.org/regular-expressions"}
    ))
    clickables = list(
        form.xpath(
            'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
            '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
    )
    if not clickables:
        return None

    # If we don't have clickdata, we just use the first clickable element
    if clickdata is None:
        el = clickables[0]
        return (el.get('name'), el.get('value') or '')
        return (el.get("name"), el.get("value") or "")

    # If clickdata is given, we compare it to the clickable elements to find a
    # match. We first look to see if the number is specified in clickdata,
    # because that uniquely identifies the element
    nr = clickdata.get('nr', None)
    nr = clickdata.get("nr", None)
    if nr is not None:
        try:
            el = list(form.inputs)[nr]
        except IndexError:
            pass
        else:
            return (el.get('name'), el.get('value') or '')
            return (el.get("name"), el.get("value") or "")

    # We didn't find it, so now we build an XPath expression out of the other
    # arguments, because they can be used as such
    xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
    xpath = ".//*" + "".join(f'[@{k}="{v}"]' for k, v in clickdata.items())
    el = form.xpath(xpath)
    if len(el) == 1:
        return (el[0].get('name'), el[0].get('value') or '')
        return (el[0].get("name"), el[0].get("value") or "")
    if len(el) > 1:
        raise ValueError(f"Multiple elements found ({el!r}) matching the "
                         f"criteria in clickdata: {clickdata!r}")
        raise ValueError(
            f"Multiple elements found ({el!r}) matching the "
            f"criteria in clickdata: {clickdata!r}"
        )
    else:
        raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
        raise ValueError(f"No clickable element matching clickdata: {clickdata!r}")

@ -20,42 +20,44 @@ class JsonRequest(Request):

    def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None:
        dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
        dumps_kwargs.setdefault('sort_keys', True)
        dumps_kwargs.setdefault("sort_keys", True)
        self._dumps_kwargs = dumps_kwargs

        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        body_passed = kwargs.get("body", None) is not None
        data = kwargs.pop("data", None)
        data_passed = data is not None

        if body_passed and data_passed:
            warnings.warn('Both body and data passed. data will be ignored')
            warnings.warn("Both body and data passed. data will be ignored")
        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)
            if 'method' not in kwargs:
                kwargs['method'] = 'POST'
            kwargs["body"] = self._dumps(data)
            if "method" not in kwargs:
                kwargs["method"] = "POST"

        super().__init__(*args, **kwargs)
        self.headers.setdefault('Content-Type', 'application/json')
        self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
        self.headers.setdefault("Content-Type", "application/json")
        self.headers.setdefault(
            "Accept", "application/json, text/javascript, */*; q=0.01"
        )

    @property
    def dumps_kwargs(self) -> dict:
        return self._dumps_kwargs

    def replace(self, *args, **kwargs) -> Request:
        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        body_passed = kwargs.get("body", None) is not None
        data = kwargs.pop("data", None)
        data_passed = data is not None

        if body_passed and data_passed:
            warnings.warn('Both body and data passed. data will be ignored')
            warnings.warn("Both body and data passed. data will be ignored")
        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)
            kwargs["body"] = self._dumps(data)

        return super().replace(*args, **kwargs)

    def _dumps(self, data: dict) -> str:
        """Convert to JSON """
        """Convert to JSON"""
        return json.dumps(data, **self._dumps_kwargs)


@ -15,21 +15,20 @@ DUMPS_ARGS = get_func_args(xmlrpclib.dumps)


class XmlRpcRequest(Request):

    def __init__(self, *args, encoding: Optional[str] = None, **kwargs):
        if 'body' not in kwargs and 'params' in kwargs:
        if "body" not in kwargs and "params" in kwargs:
            kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
            kwargs['body'] = xmlrpclib.dumps(**kw)
            kwargs["body"] = xmlrpclib.dumps(**kw)

        # spec defines that requests must use POST method
        kwargs.setdefault('method', 'POST')
        kwargs.setdefault("method", "POST")

        # xmlrpc query multiples times over the same url
        kwargs.setdefault('dont_filter', True)
        kwargs.setdefault("dont_filter", True)

        # restore encoding
        if encoding is not None:
            kwargs['encoding'] = encoding
            kwargs["encoding"] = encoding

        super().__init__(*args, **kwargs)
        self.headers.setdefault('Content-Type', 'text/xml')
        self.headers.setdefault("Content-Type", "text/xml")

@ -21,7 +21,15 @@ class Response(object_ref):
    """

    attributes: Tuple[str, ...] = (
        "url", "status", "headers", "body", "flags", "request", "certificate", "ip_address", "protocol",
        "url",
        "status",
        "headers",
        "body",
        "flags",
        "request",
        "certificate",
        "ip_address",
        "protocol",
    )
    """A tuple of :class:`str` objects containing the name of all public
    attributes of the class that are also keyword parameters of the
@ -79,26 +87,28 @@ class Response(object_ref):
        if isinstance(url, str):
            self._url = url
        else:
            raise TypeError(f'{type(self).__name__} url must be str, '
                            f'got {type(url).__name__}')
            raise TypeError(
                f"{type(self).__name__} url must be str, " f"got {type(url).__name__}"
            )

    url = property(_get_url, obsolete_setter(_set_url, 'url'))
    url = property(_get_url, obsolete_setter(_set_url, "url"))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        if body is None:
            self._body = b''
            self._body = b""
        elif not isinstance(body, bytes):
            raise TypeError(
                "Response body must be bytes. "
                "If you want to pass unicode body use TextResponse "
                "or HtmlResponse.")
                "or HtmlResponse."
            )
        else:
            self._body = body

    body = property(_get_body, obsolete_setter(_set_body, 'body'))
    body = property(_get_body, obsolete_setter(_set_body, "body"))

    def __repr__(self):
        return f"<{self.status} {self.url}>"
@ -111,7 +121,7 @@ class Response(object_ref):
        """Create a new Response with the same attributes except for those given new values"""
        for x in self.attributes:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        cls = kwargs.pop("cls", self.__class__)
        return cls(*args, **kwargs)

    def urljoin(self, url):
@ -138,9 +148,22 @@ class Response(object_ref):
        """
        raise NotSupported("Response content isn't text")

    def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding='utf-8', priority=0,
               dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Request:
    def follow(
        self,
        url,
        callback=None,
        method="GET",
        headers=None,
        body=None,
        cookies=None,
        meta=None,
        encoding="utf-8",
        priority=0,
        dont_filter=False,
        errback=None,
        cb_kwargs=None,
        flags=None,
    ) -> Request:
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
@ -176,10 +199,22 @@ class Response(object_ref):
            flags=flags,
        )

    def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
                   cookies=None, meta=None, encoding='utf-8', priority=0,
                   dont_filter=False, errback=None, cb_kwargs=None,
                   flags=None) -> Generator[Request, None, None]:
    def follow_all(
        self,
        urls,
        callback=None,
        method="GET",
        headers=None,
        body=None,
        cookies=None,
        meta=None,
        encoding="utf-8",
        priority=0,
        dont_filter=False,
        errback=None,
        cb_kwargs=None,
        flags=None,
    ) -> Generator[Request, None, None]:
        """
        .. versionadded:: 2.0

@ -192,7 +227,7 @@ class Response(object_ref):
        method which supports selectors in addition to absolute/relative URLs
        and Link objects.
        """
        if not hasattr(urls, '__iter__'):
        if not hasattr(urls, "__iter__"):
            raise TypeError("'urls' argument must be an iterable")
        return (
            self.follow(

@ -30,13 +30,13 @@ _NONE = object()

class TextResponse(Response):

    _DEFAULT_ENCODING = 'ascii'
    _DEFAULT_ENCODING = "ascii"
    _cached_decoded_json = _NONE

    attributes: Tuple[str, ...] = Response.attributes + ("encoding",)

    def __init__(self, *args, **kwargs):
        self._encoding = kwargs.pop('encoding', None)
        self._encoding = kwargs.pop("encoding", None)
        self._cached_benc = None
        self._cached_ubody = None
        self._cached_selector = None
@ -49,11 +49,13 @@ class TextResponse(Response):
        super()._set_url(url)

    def _set_body(self, body):
        self._body = b''  # used by encoding detection
        self._body = b""  # used by encoding detection
        if isinstance(body, str):
            if self._encoding is None:
                raise TypeError('Cannot convert unicode body - '
                                f'{type(self).__name__} has no encoding')
                raise TypeError(
                    "Cannot convert unicode body - "
                    f"{type(self).__name__} has no encoding"
                )
            self._body = body.encode(self._encoding)
        else:
            super()._set_body(body)
@ -82,12 +84,12 @@ class TextResponse(Response):

    @property
    def text(self):
        """ Body as unicode """
        """Body as unicode"""
        # access self.encoding before _cached_ubody to make sure
        # _body_inferred_encoding is called
        benc = self.encoding
        if self._cached_ubody is None:
            charset = f'charset={benc}'
            charset = f"charset={benc}"
            self._cached_ubody = html_to_unicode(charset, self.body)[1]
        return self._cached_ubody

@ -98,21 +100,24 @@ class TextResponse(Response):

    @memoizemethod_noargs
    def _headers_encoding(self):
        content_type = self.headers.get(b'Content-Type', b'')
        content_type = self.headers.get(b"Content-Type", b"")
        return http_content_type_encoding(to_unicode(content_type))

    def _body_inferred_encoding(self):
        if self._cached_benc is None:
            content_type = to_unicode(self.headers.get(b'Content-Type', b''))
            benc, ubody = html_to_unicode(content_type, self.body,
                                          auto_detect_fun=self._auto_detect_fun,
                                          default_encoding=self._DEFAULT_ENCODING)
            content_type = to_unicode(self.headers.get(b"Content-Type", b""))
            benc, ubody = html_to_unicode(
                content_type,
                self.body,
                auto_detect_fun=self._auto_detect_fun,
                default_encoding=self._DEFAULT_ENCODING,
            )
            self._cached_benc = benc
            self._cached_ubody = ubody
        return self._cached_benc

    def _auto_detect_fun(self, text):
        for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
        for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
            try:
                text.decode(enc)
            except UnicodeError:
@ -130,6 +135,7 @@ class TextResponse(Response):
    @property
    def selector(self):
        from scrapy.selector import Selector

        if self._cached_selector is None:
            self._cached_selector = Selector(self)
        return self._cached_selector
@ -140,9 +146,22 @@ class TextResponse(Response):
    def css(self, query):
        return self.selector.css(query)

    def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding=None, priority=0,
               dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Request:
    def follow(
        self,
        url,
        callback=None,
        method="GET",
        headers=None,
        body=None,
        cookies=None,
        meta=None,
        encoding=None,
        priority=0,
        dont_filter=False,
        errback=None,
        cb_kwargs=None,
        flags=None,
    ) -> Request:
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
@ -180,10 +199,24 @@ class TextResponse(Response):
            flags=flags,
        )

    def follow_all(self, urls=None, callback=None, method='GET', headers=None, body=None,
                   cookies=None, meta=None, encoding=None, priority=0,
                   dont_filter=False, errback=None, cb_kwargs=None, flags=None,
                   css=None, xpath=None) -> Generator[Request, None, None]:
    def follow_all(
        self,
        urls=None,
        callback=None,
        method="GET",
        headers=None,
        body=None,
        cookies=None,
        meta=None,
        encoding=None,
        priority=0,
        dont_filter=False,
        errback=None,
        cb_kwargs=None,
        flags=None,
        css=None,
        xpath=None,
    ) -> Generator[Request, None, None]:
        """
        A generator that produces :class:`~.Request` instances to follow all
        links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
@ -251,12 +284,13 @@ def _url_from_selector(sel):
    if isinstance(sel.root, str):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
    if not hasattr(sel.root, "tag"):
        raise _InvalidSelector(f"Unsupported selector: {sel}")
    if sel.root.tag not in ('a', 'link'):
        raise _InvalidSelector("Only <a> and <link> elements are supported; "
                               f"got <{sel.root.tag}>")
    href = sel.root.get('href')
    if sel.root.tag not in ("a", "link"):
        raise _InvalidSelector(
            "Only <a> and <link> elements are supported; " f"got <{sel.root.tag}>"
        )
    href = sel.root.get("href")
    if href is None:
        raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
    return strip_html5_whitespace(href)

@ -2,7 +2,6 @@ from zope.interface import Interface


class ISpiderLoader(Interface):

    def from_settings(settings):
        """Return an instance of the class for the given settings"""

@ -24,11 +24,11 @@ class ItemMeta(ABCMeta):
    """

    def __new__(mcs, class_name, bases, attrs):
        classcell = attrs.pop('__classcell__', None)
        new_bases = tuple(base._class for base in bases if hasattr(base, '_class'))
        _class = super().__new__(mcs, 'x_' + class_name, new_bases, attrs)
        classcell = attrs.pop("__classcell__", None)
        new_bases = tuple(base._class for base in bases if hasattr(base, "_class"))
        _class = super().__new__(mcs, "x_" + class_name, new_bases, attrs)

        fields = getattr(_class, 'fields', {})
        fields = getattr(_class, "fields", {})
        new_attrs = {}
        for n in dir(_class):
            v = getattr(_class, n)
@ -37,10 +37,10 @@ class ItemMeta(ABCMeta):
            elif n in attrs:
                new_attrs[n] = attrs[n]

        new_attrs['fields'] = fields
        new_attrs['_class'] = _class
        new_attrs["fields"] = fields
        new_attrs["_class"] = _class
        if classcell is not None:
            new_attrs['__classcell__'] = classcell
            new_attrs["__classcell__"] = classcell
        return super().__new__(mcs, class_name, bases, new_attrs)


@ -93,7 +93,7 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
        raise AttributeError(name)

    def __setattr__(self, name, value):
        if not name.startswith('_'):
        if not name.startswith("_"):
            raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
        super().__setattr__(name, value)

@ -115,6 +115,5 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
        return self.__class__(self)

    def deepcopy(self):
        """Return a :func:`~copy.deepcopy` of this item.
        """
        """Return a :func:`~copy.deepcopy` of this item."""
        return deepcopy(self)

@ -24,9 +24,9 @@ class Link:
    of the anchor tag.
    """

    __slots__ = ['url', 'text', 'fragment', 'nofollow']
    __slots__ = ["url", "text", "fragment", "nofollow"]

    def __init__(self, url, text='', fragment='', nofollow=False):
    def __init__(self, url, text="", fragment="", nofollow=False):
        if not isinstance(url, str):
            got = url.__class__.__name__
            raise TypeError(f"Link urls must be str objects, got {got}")
@ -44,10 +44,12 @@ class Link:
        )

    def __hash__(self):
        return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
        return (
            hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
        )

    def __repr__(self):
        return (
            f'Link(url={self.url!r}, text={self.text!r}, '
            f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
            f"Link(url={self.url!r}, text={self.text!r}, "
            f"fragment={self.fragment!r}, nofollow={self.nofollow!r})"
        )

@ -10,25 +10,81 @@ import re
# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
    # archives
    '7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',

    "7z",
    "7zip",
    "bz2",
    "rar",
    "tar",
    "tar.gz",
    "xz",
    "zip",
    # images
    'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
    'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',

    "mng",
    "pct",
    "bmp",
    "gif",
    "jpg",
    "jpeg",
    "png",
    "pst",
    "psp",
    "tif",
    "tiff",
    "ai",
    "drw",
    "dxf",
    "eps",
    "ps",
    "svg",
    "cdr",
    "ico",
    # audio
    'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',

    "mp3",
    "wma",
    "ogg",
    "wav",
    "ra",
    "aac",
    "mid",
    "au",
    "aiff",
    # video
    '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
    'm4a', 'm4v', 'flv', 'webm',

    "3gp",
    "asf",
    "asx",
    "avi",
    "mov",
    "mp4",
    "mpg",
    "qt",
    "rm",
    "swf",
    "wmv",
    "m4a",
    "m4v",
    "flv",
    "webm",
    # office suites
    'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
    'odp',

    "xls",
    "xlsx",
    "ppt",
    "pptx",
    "pps",
    "doc",
    "docx",
    "odt",
    "ods",
    "odg",
    "odp",
    # other
    'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
    "css",
    "pdf",
    "exe",
    "bin",
    "rss",
    "dmg",
    "iso",
    "apk",
]


@ -40,7 +96,7 @@ def _matches(url, regexs):


def _is_valid_url(url):
    return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
    return url.split("://", 1)[0] in {"http", "https", "file", "ftp"}


# Top-level imports

@ -11,8 +11,13 @@ from w3lib.html import strip_html5_whitespace
|
||||
from w3lib.url import canonicalize_url, safe_url_string
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.linkextractors import (IGNORED_EXTENSIONS, _is_valid_url, _matches,
|
||||
_re_type, re)
|
||||
from scrapy.linkextractors import (
|
||||
IGNORED_EXTENSIONS,
|
||||
_is_valid_url,
|
||||
_matches,
|
||||
_re_type,
|
||||
re,
|
||||
)
|
||||
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.utils.response import get_base_url
|
||||
@ -26,8 +31,8 @@ _collect_string_content = etree.XPath("string()")
|
||||
|
||||
def _nons(tag):
|
||||
if isinstance(tag, str):
|
||||
if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
|
||||
return tag.split('}')[-1]
|
||||
if tag[0] == "{" and tag[1 : len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
|
||||
return tag.split("}")[-1]
|
||||
return tag
|
||||
|
||||
|
||||
@ -41,14 +46,22 @@ def _canonicalize_link_url(link):
|
||||
|
||||
class LxmlParserLinkExtractor:
|
||||
def __init__(
|
||||
self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
|
||||
self,
|
||||
tag="a",
|
||||
attr="href",
|
||||
process=None,
|
||||
unique=False,
|
||||
strip=True,
|
||||
canonicalized=False,
|
||||
):
|
||||
self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
|
||||
self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
|
||||
self.process_attr = process if callable(process) else _identity
|
||||
self.unique = unique
|
||||
self.strip = strip
|
||||
self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url
|
||||
self.link_key = (
|
||||
operator.attrgetter("url") if canonicalized else _canonicalize_link_url
|
||||
)
|
||||
|
||||
def _iter_links(self, document):
|
||||
for el in document.iter(etree.Element):
|
||||
@ -78,17 +91,22 @@ class LxmlParserLinkExtractor:
|
||||
url = safe_url_string(url, encoding=response_encoding)
|
||||
# to fix relative links after process_value
|
||||
url = urljoin(response_url, url)
|
||||
link = Link(url, _collect_string_content(el) or '',
|
||||
nofollow=rel_has_nofollow(el.get('rel')))
|
||||
link = Link(
|
||||
url,
|
||||
_collect_string_content(el) or "",
|
||||
nofollow=rel_has_nofollow(el.get("rel")),
|
||||
)
|
||||
links.append(link)
|
||||
return self._deduplicate_if_needed(links)
|
||||
|
||||
def extract_links(self, response):
|
||||
base_url = get_base_url(response)
|
||||
return self._extract_links(response.selector, response.url, response.encoding, base_url)
|
||||
return self._extract_links(
|
||||
response.selector, response.url, response.encoding, base_url
|
||||
)
|
||||
|
||||
def _process_links(self, links):
|
||||
""" Normalize and filter extracted links
|
||||
"""Normalize and filter extracted links
|
||||
|
||||
The subclass should override it if necessary
|
||||
"""
|
||||
@ -110,8 +128,8 @@ class LxmlLinkExtractor:
|
||||
allow_domains=(),
|
||||
deny_domains=(),
|
||||
restrict_xpaths=(),
|
||||
tags=('a', 'area'),
|
||||
attrs=('href',),
|
||||
tags=("a", "area"),
|
||||
attrs=("href",),
|
||||
canonicalize=False,
|
||||
unique=True,
|
||||
process_value=None,
|
||||
@ -127,26 +145,31 @@ class LxmlLinkExtractor:
|
||||
unique=unique,
|
||||
process=process_value,
|
||||
strip=strip,
|
||||
canonicalized=canonicalize
|
||||
canonicalized=canonicalize,
|
||||
)
|
||||
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(allow)]
|
||||
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(deny)]
|
||||
self.allow_res = [
|
||||
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
|
||||
]
|
||||
self.deny_res = [
|
||||
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)
|
||||
]
|
||||
|
||||
self.allow_domains = set(arg_to_iter(allow_domains))
|
||||
self.deny_domains = set(arg_to_iter(deny_domains))
|
||||
|
||||
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
|
||||
self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
|
||||
arg_to_iter(restrict_css)))
|
||||
self.restrict_xpaths += tuple(
|
||||
map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css))
|
||||
)
|
||||
|
||||
if deny_extensions is None:
|
||||
deny_extensions = IGNORED_EXTENSIONS
|
||||
self.canonicalize = canonicalize
|
||||
self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
|
||||
self.restrict_text = [x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(restrict_text)]
|
||||
self.deny_extensions = {"." + e for e in arg_to_iter(deny_extensions)}
|
||||
self.restrict_text = [
|
||||
x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(restrict_text)
|
||||
]
|
||||
|
||||
def _link_allowed(self, link):
|
||||
if not _is_valid_url(link.url):
|
||||
@ -156,11 +179,15 @@ class LxmlLinkExtractor:
|
||||
if self.deny_res and _matches(link.url, self.deny_res):
|
||||
return False
|
||||
parsed_url = urlparse(link.url)
|
||||
if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
|
||||
if self.allow_domains and not url_is_from_any_domain(
|
||||
parsed_url, self.allow_domains
|
||||
):
|
||||
return False
|
||||
if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
|
||||
return False
|
||||
if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
|
||||
if self.deny_extensions and url_has_any_extension(
|
||||
parsed_url, self.deny_extensions
|
||||
):
|
||||
return False
|
||||
if self.restrict_text and not _matches(link.text, self.restrict_text):
|
||||
return False
|
||||
@ -173,7 +200,11 @@ class LxmlLinkExtractor:
|
||||
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
|
||||
return False
|
||||
|
||||
allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
|
||||
allowed = (
|
||||
(regex.search(url) for regex in self.allow_res)
|
||||
if self.allow_res
|
||||
else [True]
|
||||
)
|
||||
denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
|
||||
return any(allowed) and not any(denied)
|
||||
|
||||
@ -200,9 +231,7 @@ class LxmlLinkExtractor:
|
||||
base_url = get_base_url(response)
|
||||
if self.restrict_xpaths:
|
||||
docs = [
|
||||
subdoc
|
||||
for x in self.restrict_xpaths
|
||||
for subdoc in response.xpath(x)
|
||||
subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)
|
||||
]
|
||||
else:
|
||||
docs = [response.selector]
|
||||
|
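For context, and not part of this commit, a typical use of the link extractor reformatted above looks roughly like the sketch below; the patterns and the `response` object are assumptions for illustration:

from scrapy.linkextractors import LinkExtractor

# Hypothetical usage sketch: extract in-domain article links from a response.
link_extractor = LinkExtractor(
    allow=r"/articles/",          # regex the URL must match
    deny_extensions=["pdf"],      # skip linked PDF files
    restrict_css="div.content",   # only look inside the main content block
)
for link in link_extractor.extract_links(response):  # `response` assumed available
    print(link.url, link.text, link.nofollow)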
@ -15,7 +15,7 @@ def wrap_loader_context(function, context):
        "scrapy.loader.common.wrap_loader_context has moved to a new library."
        "Please update your reference to itemloaders.common.wrap_loader_context",
        ScrapyDeprecationWarning,
        stacklevel=2
        stacklevel=2,
    )

    return common.wrap_loader_context(function, context)
@ -8,14 +8,14 @@ from itemloaders import processors
from scrapy.utils.deprecate import create_deprecated_class


MapCompose = create_deprecated_class('MapCompose', processors.MapCompose)
MapCompose = create_deprecated_class("MapCompose", processors.MapCompose)

Compose = create_deprecated_class('Compose', processors.Compose)
Compose = create_deprecated_class("Compose", processors.Compose)

TakeFirst = create_deprecated_class('TakeFirst', processors.TakeFirst)
TakeFirst = create_deprecated_class("TakeFirst", processors.TakeFirst)

Identity = create_deprecated_class('Identity', processors.Identity)
Identity = create_deprecated_class("Identity", processors.Identity)

SelectJmes = create_deprecated_class('SelectJmes', processors.SelectJmes)
SelectJmes = create_deprecated_class("SelectJmes", processors.SelectJmes)

Join = create_deprecated_class('Join', processors.Join)
Join = create_deprecated_class("Join", processors.Join)
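These are deprecation shims only; new code is expected to import the processors from the itemloaders package directly. A small sketch of the replacement imports and behaviour (values are made up):

# Preferred imports after the move to the itemloaders package.
from itemloaders.processors import MapCompose, TakeFirst

clean_name = MapCompose(str.strip, str.title)
print(clean_name(["  alice  ", " bob "]))    # ['Alice', 'Bob']
print(TakeFirst()(["", "first", "second"]))  # 'first'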
@ -54,20 +54,20 @@ class LogFormatter:

    def crawled(self, request, response, spider):
        """Logs a message when the crawler finds a webpage."""
        request_flags = f' {str(request.flags)}' if request.flags else ''
        response_flags = f' {str(response.flags)}' if response.flags else ''
        request_flags = f" {str(request.flags)}" if request.flags else ""
        response_flags = f" {str(response.flags)}" if response.flags else ""
        return {
            'level': logging.DEBUG,
            'msg': CRAWLEDMSG,
            'args': {
                'status': response.status,
                'request': request,
                'request_flags': request_flags,
                'referer': referer_str(request),
                'response_flags': response_flags,
            "level": logging.DEBUG,
            "msg": CRAWLEDMSG,
            "args": {
                "status": response.status,
                "request": request,
                "request_flags": request_flags,
                "referer": referer_str(request),
                "response_flags": response_flags,
                # backward compatibility with Scrapy logformatter below 1.4 version
                'flags': response_flags
            }
                "flags": response_flags,
            },
        }

    def scraped(self, item, response, spider):
@ -77,23 +77,23 @@ class LogFormatter:
        else:
            src = response
        return {
            'level': logging.DEBUG,
            'msg': SCRAPEDMSG,
            'args': {
                'src': src,
                'item': item,
            }
            "level": logging.DEBUG,
            "msg": SCRAPEDMSG,
            "args": {
                "src": src,
                "item": item,
            },
        }

    def dropped(self, item, exception, response, spider):
        """Logs a message when an item is dropped while it is passing through the item pipeline."""
        return {
            'level': logging.WARNING,
            'msg': DROPPEDMSG,
            'args': {
                'exception': exception,
                'item': item,
            }
            "level": logging.WARNING,
            "msg": DROPPEDMSG,
            "args": {
                "exception": exception,
                "item": item,
            },
        }

    def item_error(self, item, exception, response, spider):
@ -103,11 +103,11 @@ class LogFormatter:
        .. versionadded:: 2.0
        """
        return {
            'level': logging.ERROR,
            'msg': ITEMERRORMSG,
            'args': {
                'item': item,
            }
            "level": logging.ERROR,
            "msg": ITEMERRORMSG,
            "args": {
                "item": item,
            },
        }

    def spider_error(self, failure, request, response, spider):
@ -116,12 +116,12 @@ class LogFormatter:
        .. versionadded:: 2.0
        """
        return {
            'level': logging.ERROR,
            'msg': SPIDERERRORMSG,
            'args': {
                'request': request,
                'referer': referer_str(request),
            }
            "level": logging.ERROR,
            "msg": SPIDERERRORMSG,
            "args": {
                "request": request,
                "referer": referer_str(request),
            },
        }

    def download_error(self, failure, request, spider, errmsg=None):
@ -130,16 +130,16 @@ class LogFormatter:

        .. versionadded:: 2.0
        """
        args = {'request': request}
        args = {"request": request}
        if errmsg:
            msg = DOWNLOADERRORMSG_LONG
            args['errmsg'] = errmsg
            args["errmsg"] = errmsg
        else:
            msg = DOWNLOADERRORMSG_SHORT
        return {
            'level': logging.ERROR,
            'msg': msg,
            'args': args,
            "level": logging.ERROR,
            "msg": msg,
            "args": args,
        }

    @classmethod
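As a usage note (not part of the diff), projects can subclass LogFormatter to adjust these log entries; a sketch along the lines of the example in the Scrapy documentation, which lowers the level used for dropped items, looks roughly like this (the settings module path is an assumption):

import logging
from scrapy import logformatter

class PoliteLogFormatter(logformatter.LogFormatter):
    def dropped(self, item, exception, response, spider):
        # Log dropped items at INFO instead of WARNING.
        return {
            "level": logging.INFO,
            "msg": logformatter.DROPPEDMSG,
            "args": {"exception": exception, "item": item},
        }

# settings.py (hypothetical project path):
# LOG_FORMATTER = "myproject.logformatter.PoliteLogFormatter"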
scrapy/mail.py
@ -34,8 +34,15 @@ def _to_bytes_or_none(text):
|
||||
|
||||
class MailSender:
|
||||
def __init__(
|
||||
self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
|
||||
smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
|
||||
self,
|
||||
smtphost="localhost",
|
||||
mailfrom="scrapy@localhost",
|
||||
smtpuser=None,
|
||||
smtppass=None,
|
||||
smtpport=25,
|
||||
smtptls=False,
|
||||
smtpssl=False,
|
||||
debug=False,
|
||||
):
|
||||
self.smtphost = smtphost
|
||||
self.smtpport = smtpport
|
||||
@ -49,44 +56,57 @@ class MailSender:
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls(
|
||||
smtphost=settings['MAIL_HOST'],
|
||||
mailfrom=settings['MAIL_FROM'],
|
||||
smtpuser=settings['MAIL_USER'],
|
||||
smtppass=settings['MAIL_PASS'],
|
||||
smtpport=settings.getint('MAIL_PORT'),
|
||||
smtptls=settings.getbool('MAIL_TLS'),
|
||||
smtpssl=settings.getbool('MAIL_SSL'),
|
||||
smtphost=settings["MAIL_HOST"],
|
||||
mailfrom=settings["MAIL_FROM"],
|
||||
smtpuser=settings["MAIL_USER"],
|
||||
smtppass=settings["MAIL_PASS"],
|
||||
smtpport=settings.getint("MAIL_PORT"),
|
||||
smtptls=settings.getbool("MAIL_TLS"),
|
||||
smtpssl=settings.getbool("MAIL_SSL"),
|
||||
)
|
||||
|
||||
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
|
||||
def send(
|
||||
self,
|
||||
to,
|
||||
subject,
|
||||
body,
|
||||
cc=None,
|
||||
attachs=(),
|
||||
mimetype="text/plain",
|
||||
charset=None,
|
||||
_callback=None,
|
||||
):
|
||||
from twisted.internet import reactor
|
||||
|
||||
if attachs:
|
||||
msg = MIMEMultipart()
|
||||
else:
|
||||
msg = MIMENonMultipart(*mimetype.split('/', 1))
|
||||
msg = MIMENonMultipart(*mimetype.split("/", 1))
|
||||
|
||||
to = list(arg_to_iter(to))
|
||||
cc = list(arg_to_iter(cc))
|
||||
|
||||
msg['From'] = self.mailfrom
|
||||
msg['To'] = COMMASPACE.join(to)
|
||||
msg['Date'] = formatdate(localtime=True)
|
||||
msg['Subject'] = subject
|
||||
msg["From"] = self.mailfrom
|
||||
msg["To"] = COMMASPACE.join(to)
|
||||
msg["Date"] = formatdate(localtime=True)
|
||||
msg["Subject"] = subject
|
||||
rcpts = to[:]
|
||||
if cc:
|
||||
rcpts.extend(cc)
|
||||
msg['Cc'] = COMMASPACE.join(cc)
|
||||
msg["Cc"] = COMMASPACE.join(cc)
|
||||
|
||||
if charset:
|
||||
msg.set_charset(charset)
|
||||
|
||||
if attachs:
|
||||
msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
|
||||
msg.attach(MIMEText(body, "plain", charset or "us-ascii"))
|
||||
for attach_name, mimetype, f in attachs:
|
||||
part = MIMEBase(*mimetype.split('/'))
|
||||
part = MIMEBase(*mimetype.split("/"))
|
||||
part.set_payload(f.read())
|
||||
Encoders.encode_base64(part)
|
||||
part.add_header('Content-Disposition', 'attachment', filename=attach_name)
|
||||
part.add_header(
|
||||
"Content-Disposition", "attachment", filename=attach_name
|
||||
)
|
||||
msg.attach(part)
|
||||
else:
|
||||
msg.set_payload(body)
|
||||
@ -95,50 +115,79 @@ class MailSender:
|
||||
_callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
|
||||
|
||||
if self.debug:
|
||||
logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': len(attachs)})
|
||||
logger.debug(
|
||||
"Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{
|
||||
"mailto": to,
|
||||
"mailcc": cc,
|
||||
"mailsubject": subject,
|
||||
"mailattachs": len(attachs),
|
||||
},
|
||||
)
|
||||
return
|
||||
|
||||
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
|
||||
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or "utf-8"))
|
||||
dfd.addCallbacks(
|
||||
callback=self._sent_ok,
|
||||
errback=self._sent_failed,
|
||||
callbackArgs=[to, cc, subject, len(attachs)],
|
||||
errbackArgs=[to, cc, subject, len(attachs)],
|
||||
)
|
||||
reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
|
||||
reactor.addSystemEventTrigger("before", "shutdown", lambda: dfd)
|
||||
return dfd
|
||||
|
||||
def _sent_ok(self, result, to, cc, subject, nattachs):
|
||||
logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': nattachs})
|
||||
logger.info(
|
||||
"Mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{
|
||||
"mailto": to,
|
||||
"mailcc": cc,
|
||||
"mailsubject": subject,
|
||||
"mailattachs": nattachs,
|
||||
},
|
||||
)
|
||||
|
||||
def _sent_failed(self, failure, to, cc, subject, nattachs):
|
||||
errstr = str(failure.value)
|
||||
logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
|
||||
'- %(mailerr)s',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': nattachs, 'mailerr': errstr})
|
||||
logger.error(
|
||||
"Unable to send mail: To=%(mailto)s Cc=%(mailcc)s "
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
|
||||
"- %(mailerr)s",
|
||||
{
|
||||
"mailto": to,
|
||||
"mailcc": cc,
|
||||
"mailsubject": subject,
|
||||
"mailattachs": nattachs,
|
||||
"mailerr": errstr,
|
||||
},
|
||||
)
|
||||
|
||||
def _sendmail(self, to_addrs, msg):
|
||||
# Import twisted.mail here because it is not available in python3
|
||||
from twisted.internet import reactor
|
||||
from twisted.mail.smtp import ESMTPSenderFactory
|
||||
|
||||
msg = BytesIO(msg)
|
||||
d = defer.Deferred()
|
||||
factory = ESMTPSenderFactory(
|
||||
self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
|
||||
heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
|
||||
self.smtpuser,
|
||||
self.smtppass,
|
||||
self.mailfrom,
|
||||
to_addrs,
|
||||
msg,
|
||||
d,
|
||||
heloFallback=True,
|
||||
requireAuthentication=False,
|
||||
requireTransportSecurity=self.smtptls,
|
||||
)
|
||||
factory.noisy = False
|
||||
|
||||
if self.smtpssl:
|
||||
reactor.connectSSL(self.smtphost, self.smtpport, factory, ssl.ClientContextFactory())
|
||||
reactor.connectSSL(
|
||||
self.smtphost, self.smtpport, factory, ssl.ClientContextFactory()
|
||||
)
|
||||
else:
|
||||
reactor.connectTCP(self.smtphost, self.smtpport, factory)
|
||||
|
||||
|
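For reference (again not part of the commit), the reformatted MailSender is usually built from settings and used like the rough sketch below; the addresses are placeholders and `crawler` is assumed to be in scope:

from scrapy.mail import MailSender

# Build the sender from the crawler settings (MAIL_HOST, MAIL_FROM, ...).
mailer = MailSender.from_settings(crawler.settings)
deferred = mailer.send(
    to=["someone@example.com"],
    subject="Scrape finished",
    body="All done.",
    cc=["team@example.com"],
)
# send() returns a Twisted Deferred (or None in debug mode), so delivery errors
# arrive asynchronously via the _sent_failed errback shown above.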
@ -17,13 +17,15 @@ logger = logging.getLogger(__name__)
class MiddlewareManager:
    """Base class for implementing middleware managers"""

    component_name = 'foo middleware'
    component_name = "foo middleware"

    def __init__(self, *middlewares: Any) -> None:
        self.middlewares = middlewares
        # Only process_spider_output and process_spider_exception can be None.
        # Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
        self.methods: Dict[str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]] = defaultdict(deque)
        self.methods: Dict[
            str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]
        ] = defaultdict(deque)
        for mw in middlewares:
            self._add_middleware(mw)

@ -44,15 +46,21 @@ class MiddlewareManager:
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})
                    clsname = clspath.split(".")[-1]
                    logger.warning(
                        "Disabled %(clsname)s: %(eargs)s",
                        {"clsname": clsname, "eargs": e.args[0]},
                        extra={"crawler": crawler},
                    )

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        logger.info(
            "Enabled %(componentname)ss:\n%(enabledlist)s",
            {
                "componentname": cls.component_name,
                "enabledlist": pprint.pformat(enabled),
            },
            extra={"crawler": crawler},
        )
        return cls(*middlewares)

    @classmethod
@ -60,10 +68,10 @@ class MiddlewareManager:
        return cls.from_settings(crawler.settings, crawler)

    def _add_middleware(self, mw) -> None:
        if hasattr(mw, 'open_spider'):
            self.methods['open_spider'].append(mw.open_spider)
        if hasattr(mw, 'close_spider'):
            self.methods['close_spider'].appendleft(mw.close_spider)
        if hasattr(mw, "open_spider"):
            self.methods["open_spider"].append(mw.open_spider)
        if hasattr(mw, "close_spider"):
            self.methods["close_spider"].appendleft(mw.close_spider)

    def _process_parallel(self, methodname: str, obj, *args) -> Deferred:
        methods = cast(Iterable[Callable], self.methods[methodname])
@ -74,7 +82,7 @@ class MiddlewareManager:
        return process_chain(methods, obj, *args)

    def open_spider(self, spider: Spider) -> Deferred:
        return self._process_parallel('open_spider', spider)
        return self._process_parallel("open_spider", spider)

    def close_spider(self, spider: Spider) -> Deferred:
        return self._process_parallel('close_spider', spider)
        return self._process_parallel("close_spider", spider)
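To make the hook wiring above concrete: any component that exposes open_spider or close_spider is picked up by _add_middleware. A minimal hypothetical component, for illustration only:

class TimingMiddleware:
    # Hypothetical example: MiddlewareManager appends open_spider and
    # prepends close_spider to its method deques when this is installed.
    def open_spider(self, spider):
        spider.logger.info("spider opened: %s", spider.name)

    def close_spider(self, spider):
        spider.logger.info("spider closed: %s", spider.name)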
@ -11,16 +11,18 @@ from scrapy.utils.defer import deferred_f_from_coro_f

class ItemPipelineManager(MiddlewareManager):

    component_name = 'item pipeline'
    component_name = "item pipeline"

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('ITEM_PIPELINES'))
        return build_component_list(settings.getwithbase("ITEM_PIPELINES"))

    def _add_middleware(self, pipe):
        super()._add_middleware(pipe)
        if hasattr(pipe, 'process_item'):
            self.methods['process_item'].append(deferred_f_from_coro_f(pipe.process_item))
        if hasattr(pipe, "process_item"):
            self.methods["process_item"].append(
                deferred_f_from_coro_f(pipe.process_item)
            )

    def process_item(self, item, spider):
        return self._process_chain('process_item', item, spider)
        return self._process_chain("process_item", item, spider)
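For context, process_item may be a plain method or a coroutine (hence the deferred_f_from_coro_f wrapping above). A minimal pipeline and the setting that enables it might look like this sketch; the project module path and field name are assumptions:

# pipelines.py (hypothetical project module)
from itemadapter import ItemAdapter

class PriceCleanupPipeline:
    async def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get("price"):
            # Normalize "$9.99" style strings to floats.
            adapter["price"] = float(str(adapter["price"]).strip("$"))
        return item

# settings.py
# ITEM_PIPELINES = {"myproject.pipelines.PriceCleanupPipeline": 300}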
@ -42,8 +42,8 @@ class FileException(Exception):
|
||||
|
||||
class FSFilesStore:
|
||||
def __init__(self, basedir: str):
|
||||
if '://' in basedir:
|
||||
basedir = basedir.split('://', 1)[1]
|
||||
if "://" in basedir:
|
||||
basedir = basedir.split("://", 1)[1]
|
||||
self.basedir = basedir
|
||||
self._mkdir(Path(self.basedir))
|
||||
self.created_directories: DefaultDict[str, Set[str]] = defaultdict(set)
|
||||
@ -60,13 +60,13 @@ class FSFilesStore:
|
||||
except os.error:
|
||||
return {}
|
||||
|
||||
with absolute_path.open('rb') as f:
|
||||
with absolute_path.open("rb") as f:
|
||||
checksum = md5sum(f)
|
||||
|
||||
return {'last_modified': last_modified, 'checksum': checksum}
|
||||
return {"last_modified": last_modified, "checksum": checksum}
|
||||
|
||||
def _get_filesystem_path(self, path: str) -> Path:
|
||||
path_comps = path.split('/')
|
||||
path_comps = path.split("/")
|
||||
return Path(self.basedir, *path_comps)
|
||||
|
||||
def _mkdir(self, dirname: Path, domain: Optional[str] = None):
|
||||
@ -86,49 +86,49 @@ class S3FilesStore:
|
||||
AWS_USE_SSL = None
|
||||
AWS_VERIFY = None
|
||||
|
||||
POLICY = 'private' # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
|
||||
POLICY = "private" # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
|
||||
HEADERS = {
|
||||
'Cache-Control': 'max-age=172800',
|
||||
"Cache-Control": "max-age=172800",
|
||||
}
|
||||
|
||||
def __init__(self, uri):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured('missing botocore library')
|
||||
raise NotConfigured("missing botocore library")
|
||||
import botocore.session
|
||||
|
||||
session = botocore.session.get_session()
|
||||
self.s3_client = session.create_client(
|
||||
's3',
|
||||
"s3",
|
||||
aws_access_key_id=self.AWS_ACCESS_KEY_ID,
|
||||
aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
|
||||
aws_session_token=self.AWS_SESSION_TOKEN,
|
||||
endpoint_url=self.AWS_ENDPOINT_URL,
|
||||
region_name=self.AWS_REGION_NAME,
|
||||
use_ssl=self.AWS_USE_SSL,
|
||||
verify=self.AWS_VERIFY
|
||||
verify=self.AWS_VERIFY,
|
||||
)
|
||||
if not uri.startswith("s3://"):
|
||||
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
|
||||
self.bucket, self.prefix = uri[5:].split('/', 1)
|
||||
self.bucket, self.prefix = uri[5:].split("/", 1)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
def _onsuccess(boto_key):
|
||||
checksum = boto_key['ETag'].strip('"')
|
||||
last_modified = boto_key['LastModified']
|
||||
checksum = boto_key["ETag"].strip('"')
|
||||
last_modified = boto_key["LastModified"]
|
||||
modified_stamp = time.mktime(last_modified.timetuple())
|
||||
return {'checksum': checksum, 'last_modified': modified_stamp}
|
||||
return {"checksum": checksum, "last_modified": modified_stamp}
|
||||
|
||||
return self._get_boto_key(path).addCallback(_onsuccess)
|
||||
|
||||
def _get_boto_key(self, path):
|
||||
key_name = f'{self.prefix}{path}'
|
||||
key_name = f"{self.prefix}{path}"
|
||||
return threads.deferToThread(
|
||||
self.s3_client.head_object,
|
||||
Bucket=self.bucket,
|
||||
Key=key_name)
|
||||
self.s3_client.head_object, Bucket=self.bucket, Key=key_name
|
||||
)
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
"""Upload file to S3 storage"""
|
||||
key_name = f'{self.prefix}{path}'
|
||||
key_name = f"{self.prefix}{path}"
|
||||
buf.seek(0)
|
||||
extra = self._headers_to_botocore_kwargs(self.HEADERS)
|
||||
if headers:
|
||||
@ -140,39 +140,41 @@ class S3FilesStore:
|
||||
Body=buf,
|
||||
Metadata={k: str(v) for k, v in (meta or {}).items()},
|
||||
ACL=self.POLICY,
|
||||
**extra)
|
||||
**extra,
|
||||
)
|
||||
|
||||
def _headers_to_botocore_kwargs(self, headers):
|
||||
""" Convert headers to botocore keyword arguments.
|
||||
"""
|
||||
"""Convert headers to botocore keyword arguments."""
|
||||
# This is required while we need to support both boto and botocore.
|
||||
mapping = CaselessDict({
|
||||
'Content-Type': 'ContentType',
|
||||
'Cache-Control': 'CacheControl',
|
||||
'Content-Disposition': 'ContentDisposition',
|
||||
'Content-Encoding': 'ContentEncoding',
|
||||
'Content-Language': 'ContentLanguage',
|
||||
'Content-Length': 'ContentLength',
|
||||
'Content-MD5': 'ContentMD5',
|
||||
'Expires': 'Expires',
|
||||
'X-Amz-Grant-Full-Control': 'GrantFullControl',
|
||||
'X-Amz-Grant-Read': 'GrantRead',
|
||||
'X-Amz-Grant-Read-ACP': 'GrantReadACP',
|
||||
'X-Amz-Grant-Write-ACP': 'GrantWriteACP',
|
||||
'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus',
|
||||
'X-Amz-Object-Lock-Mode': 'ObjectLockMode',
|
||||
'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate',
|
||||
'X-Amz-Request-Payer': 'RequestPayer',
|
||||
'X-Amz-Server-Side-Encryption': 'ServerSideEncryption',
|
||||
'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId',
|
||||
'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5',
|
||||
'X-Amz-Storage-Class': 'StorageClass',
|
||||
'X-Amz-Tagging': 'Tagging',
|
||||
'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation',
|
||||
})
|
||||
mapping = CaselessDict(
|
||||
{
|
||||
"Content-Type": "ContentType",
|
||||
"Cache-Control": "CacheControl",
|
||||
"Content-Disposition": "ContentDisposition",
|
||||
"Content-Encoding": "ContentEncoding",
|
||||
"Content-Language": "ContentLanguage",
|
||||
"Content-Length": "ContentLength",
|
||||
"Content-MD5": "ContentMD5",
|
||||
"Expires": "Expires",
|
||||
"X-Amz-Grant-Full-Control": "GrantFullControl",
|
||||
"X-Amz-Grant-Read": "GrantRead",
|
||||
"X-Amz-Grant-Read-ACP": "GrantReadACP",
|
||||
"X-Amz-Grant-Write-ACP": "GrantWriteACP",
|
||||
"X-Amz-Object-Lock-Legal-Hold": "ObjectLockLegalHoldStatus",
|
||||
"X-Amz-Object-Lock-Mode": "ObjectLockMode",
|
||||
"X-Amz-Object-Lock-Retain-Until-Date": "ObjectLockRetainUntilDate",
|
||||
"X-Amz-Request-Payer": "RequestPayer",
|
||||
"X-Amz-Server-Side-Encryption": "ServerSideEncryption",
|
||||
"X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id": "SSEKMSKeyId",
|
||||
"X-Amz-Server-Side-Encryption-Context": "SSEKMSEncryptionContext",
|
||||
"X-Amz-Server-Side-Encryption-Customer-Algorithm": "SSECustomerAlgorithm",
|
||||
"X-Amz-Server-Side-Encryption-Customer-Key": "SSECustomerKey",
|
||||
"X-Amz-Server-Side-Encryption-Customer-Key-Md5": "SSECustomerKeyMD5",
|
||||
"X-Amz-Storage-Class": "StorageClass",
|
||||
"X-Amz-Tagging": "Tagging",
|
||||
"X-Amz-Website-Redirect-Location": "WebsiteRedirectLocation",
|
||||
}
|
||||
)
|
||||
extra = {}
|
||||
for key, value in headers.items():
|
||||
try:
|
||||
@ -188,7 +190,7 @@ class GCSFilesStore:
|
||||
|
||||
GCS_PROJECT_ID = None
|
||||
|
||||
CACHE_CONTROL = 'max-age=172800'
|
||||
CACHE_CONTROL = "max-age=172800"
|
||||
|
||||
# The bucket's default object ACL will be applied to the object.
|
||||
# Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
|
||||
@ -196,23 +198,24 @@ class GCSFilesStore:
|
||||
|
||||
def __init__(self, uri):
|
||||
from google.cloud import storage
|
||||
|
||||
client = storage.Client(project=self.GCS_PROJECT_ID)
|
||||
bucket, prefix = uri[5:].split('/', 1)
|
||||
bucket, prefix = uri[5:].split("/", 1)
|
||||
self.bucket = client.bucket(bucket)
|
||||
self.prefix = prefix
|
||||
permissions = self.bucket.test_iam_permissions(
|
||||
['storage.objects.get', 'storage.objects.create']
|
||||
["storage.objects.get", "storage.objects.create"]
|
||||
)
|
||||
if 'storage.objects.get' not in permissions:
|
||||
if "storage.objects.get" not in permissions:
|
||||
logger.warning(
|
||||
"No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
|
||||
"Checking if files are up to date will be impossible. Files will be downloaded every time.",
|
||||
{'bucket': bucket}
|
||||
{"bucket": bucket},
|
||||
)
|
||||
if 'storage.objects.create' not in permissions:
|
||||
if "storage.objects.create" not in permissions:
|
||||
logger.error(
|
||||
"No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
|
||||
{'bucket': bucket}
|
||||
{"bucket": bucket},
|
||||
)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
@ -220,15 +223,18 @@ class GCSFilesStore:
|
||||
if blob:
|
||||
checksum = blob.md5_hash
|
||||
last_modified = time.mktime(blob.updated.timetuple())
|
||||
return {'checksum': checksum, 'last_modified': last_modified}
|
||||
return {"checksum": checksum, "last_modified": last_modified}
|
||||
return {}
|
||||
|
||||
blob_path = self._get_blob_path(path)
|
||||
return threads.deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess)
|
||||
return threads.deferToThread(self.bucket.get_blob, blob_path).addCallback(
|
||||
_onsuccess
|
||||
)
|
||||
|
||||
def _get_content_type(self, headers):
|
||||
if headers and 'Content-Type' in headers:
|
||||
return headers['Content-Type']
|
||||
return 'application/octet-stream'
|
||||
if headers and "Content-Type" in headers:
|
||||
return headers["Content-Type"]
|
||||
return "application/octet-stream"
|
||||
|
||||
def _get_blob_path(self, path):
|
||||
return self.prefix + path
|
||||
@ -242,7 +248,7 @@ class GCSFilesStore:
|
||||
blob.upload_from_string,
|
||||
data=buf.getvalue(),
|
||||
content_type=self._get_content_type(headers),
|
||||
predefined_acl=self.POLICY
|
||||
predefined_acl=self.POLICY,
|
||||
)
|
||||
|
||||
|
||||
@ -261,14 +267,19 @@ class FTPFilesStore:
|
||||
self.port = int(u.port or 21)
|
||||
self.username = u.username or self.FTP_USERNAME
|
||||
self.password = u.password or self.FTP_PASSWORD
|
||||
self.basedir = u.path.rstrip('/')
|
||||
self.basedir = u.path.rstrip("/")
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
path = f'{self.basedir}/{path}'
|
||||
path = f"{self.basedir}/{path}"
|
||||
return threads.deferToThread(
|
||||
ftp_store_file, path=path, file=buf,
|
||||
host=self.host, port=self.port, username=self.username,
|
||||
password=self.password, use_active_mode=self.USE_ACTIVE_MODE
|
||||
ftp_store_file,
|
||||
path=path,
|
||||
file=buf,
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
use_active_mode=self.USE_ACTIVE_MODE,
|
||||
)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
@ -282,11 +293,12 @@ class FTPFilesStore:
|
||||
file_path = f"{self.basedir}/{path}"
|
||||
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
|
||||
m = hashlib.md5()
|
||||
ftp.retrbinary(f'RETR {file_path}', m.update)
|
||||
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
|
||||
ftp.retrbinary(f"RETR {file_path}", m.update)
|
||||
return {"last_modified": last_modified, "checksum": m.hexdigest()}
|
||||
# The file doesn't exist
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
return threads.deferToThread(_stat_file, path)
|
||||
|
||||
|
||||
@ -312,14 +324,14 @@ class FilesPipeline(MediaPipeline):
|
||||
MEDIA_NAME = "file"
|
||||
EXPIRES = 90
|
||||
STORE_SCHEMES = {
|
||||
'': FSFilesStore,
|
||||
'file': FSFilesStore,
|
||||
's3': S3FilesStore,
|
||||
'gs': GCSFilesStore,
|
||||
'ftp': FTPFilesStore
|
||||
"": FSFilesStore,
|
||||
"file": FSFilesStore,
|
||||
"s3": S3FilesStore,
|
||||
"gs": GCSFilesStore,
|
||||
"ftp": FTPFilesStore,
|
||||
}
|
||||
DEFAULT_FILES_URLS_FIELD = 'file_urls'
|
||||
DEFAULT_FILES_RESULT_FIELD = 'files'
|
||||
DEFAULT_FILES_URLS_FIELD = "file_urls"
|
||||
DEFAULT_FILES_RESULT_FIELD = "files"
|
||||
|
||||
def __init__(self, store_uri, download_func=None, settings=None):
|
||||
if not store_uri:
|
||||
@ -330,52 +342,50 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
cls_name = "FilesPipeline"
|
||||
self.store = self._get_store(store_uri)
|
||||
resolve = functools.partial(self._key_for_pipe,
|
||||
base_class_name=cls_name,
|
||||
settings=settings)
|
||||
self.expires = settings.getint(
|
||||
resolve('FILES_EXPIRES'), self.EXPIRES
|
||||
resolve = functools.partial(
|
||||
self._key_for_pipe, base_class_name=cls_name, settings=settings
|
||||
)
|
||||
self.expires = settings.getint(resolve("FILES_EXPIRES"), self.EXPIRES)
|
||||
if not hasattr(self, "FILES_URLS_FIELD"):
|
||||
self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
|
||||
if not hasattr(self, "FILES_RESULT_FIELD"):
|
||||
self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
|
||||
self.files_urls_field = settings.get(
|
||||
resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
|
||||
resolve("FILES_URLS_FIELD"), self.FILES_URLS_FIELD
|
||||
)
|
||||
self.files_result_field = settings.get(
|
||||
resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
|
||||
resolve("FILES_RESULT_FIELD"), self.FILES_RESULT_FIELD
|
||||
)
|
||||
|
||||
super().__init__(download_func=download_func, settings=settings)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
s3store.AWS_SESSION_TOKEN = settings['AWS_SESSION_TOKEN']
|
||||
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
|
||||
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
|
||||
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
|
||||
s3store.AWS_VERIFY = settings['AWS_VERIFY']
|
||||
s3store.POLICY = settings['FILES_STORE_S3_ACL']
|
||||
s3store = cls.STORE_SCHEMES["s3"]
|
||||
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
|
||||
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
|
||||
s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
|
||||
s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
|
||||
s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
|
||||
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
|
||||
s3store.POLICY = settings["FILES_STORE_S3_ACL"]
|
||||
|
||||
gcs_store = cls.STORE_SCHEMES['gs']
|
||||
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
|
||||
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None
|
||||
gcs_store = cls.STORE_SCHEMES["gs"]
|
||||
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
|
||||
gcs_store.POLICY = settings["FILES_STORE_GCS_ACL"] or None
|
||||
|
||||
ftp_store = cls.STORE_SCHEMES['ftp']
|
||||
ftp_store.FTP_USERNAME = settings['FTP_USER']
|
||||
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
|
||||
ftp_store = cls.STORE_SCHEMES["ftp"]
|
||||
ftp_store.FTP_USERNAME = settings["FTP_USER"]
|
||||
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
|
||||
|
||||
store_uri = settings['FILES_STORE']
|
||||
store_uri = settings["FILES_STORE"]
|
||||
return cls(store_uri, settings=settings)
|
||||
|
||||
def _get_store(self, uri: str):
|
||||
if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir
|
||||
scheme = 'file'
|
||||
scheme = "file"
|
||||
else:
|
||||
scheme = urlparse(uri).scheme
|
||||
store_cls = self.STORE_SCHEMES[scheme]
|
||||
@ -386,7 +396,7 @@ class FilesPipeline(MediaPipeline):
|
||||
if not result:
|
||||
return # returning None force download
|
||||
|
||||
last_modified = result.get('last_modified', None)
|
||||
last_modified = result.get("last_modified", None)
|
||||
if not last_modified:
|
||||
return # returning None force download
|
||||
|
||||
@ -397,25 +407,30 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
referer = referer_str(request)
|
||||
logger.debug(
|
||||
'File (uptodate): Downloaded %(medianame)s from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
"File (uptodate): Downloaded %(medianame)s from %(request)s "
|
||||
"referred in <%(referer)s>",
|
||||
{"medianame": self.MEDIA_NAME, "request": request, "referer": referer},
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
self.inc_stats(info.spider, 'uptodate')
|
||||
self.inc_stats(info.spider, "uptodate")
|
||||
|
||||
checksum = result.get('checksum', None)
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}
|
||||
checksum = result.get("checksum", None)
|
||||
return {
|
||||
"url": request.url,
|
||||
"path": path,
|
||||
"checksum": checksum,
|
||||
"status": "uptodate",
|
||||
}
|
||||
|
||||
path = self.file_path(request, info=info, item=item)
|
||||
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
|
||||
dfd.addCallbacks(_onsuccess, lambda _: None)
|
||||
dfd.addErrback(
|
||||
lambda f:
|
||||
logger.error(self.__class__.__name__ + '.store.stat_file',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': info.spider})
|
||||
lambda f: logger.error(
|
||||
self.__class__.__name__ + ".store.stat_file",
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
)
|
||||
return dfd
|
||||
|
||||
@ -423,11 +438,15 @@ class FilesPipeline(MediaPipeline):
|
||||
if not isinstance(failure.value, IgnoreRequest):
|
||||
referer = referer_str(request)
|
||||
logger.warning(
|
||||
'File (unknown-error): Error downloading %(medianame)s from '
|
||||
'%(request)s referred in <%(referer)s>: %(exception)s',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer, 'exception': failure.value},
|
||||
extra={'spider': info.spider}
|
||||
"File (unknown-error): Error downloading %(medianame)s from "
|
||||
"%(request)s referred in <%(referer)s>: %(exception)s",
|
||||
{
|
||||
"medianame": self.MEDIA_NAME,
|
||||
"request": request,
|
||||
"referer": referer,
|
||||
"exception": failure.value,
|
||||
},
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
|
||||
raise FileException
|
||||
@ -437,29 +456,28 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
if response.status != 200:
|
||||
logger.warning(
|
||||
'File (code: %(status)s): Error downloading file from '
|
||||
'%(request)s referred in <%(referer)s>',
|
||||
{'status': response.status,
|
||||
'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
"File (code: %(status)s): Error downloading file from "
|
||||
"%(request)s referred in <%(referer)s>",
|
||||
{"status": response.status, "request": request, "referer": referer},
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
raise FileException('download-error')
|
||||
raise FileException("download-error")
|
||||
|
||||
if not response.body:
|
||||
logger.warning(
|
||||
'File (empty-content): Empty file from %(request)s referred '
|
||||
'in <%(referer)s>: no-content',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
"File (empty-content): Empty file from %(request)s referred "
|
||||
"in <%(referer)s>: no-content",
|
||||
{"request": request, "referer": referer},
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
raise FileException('empty-content')
|
||||
raise FileException("empty-content")
|
||||
|
||||
status = 'cached' if 'cached' in response.flags else 'downloaded'
|
||||
status = "cached" if "cached" in response.flags else "downloaded"
|
||||
logger.debug(
|
||||
'File (%(status)s): Downloaded file from %(request)s referred in '
|
||||
'<%(referer)s>',
|
||||
{'status': status, 'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
"File (%(status)s): Downloaded file from %(request)s referred in "
|
||||
"<%(referer)s>",
|
||||
{"status": status, "request": request, "referer": referer},
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
self.inc_stats(info.spider, status)
|
||||
|
||||
@ -468,26 +486,33 @@ class FilesPipeline(MediaPipeline):
|
||||
checksum = self.file_downloaded(response, request, info, item=item)
|
||||
except FileException as exc:
|
||||
logger.warning(
|
||||
'File (error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>: %(errormsg)s',
|
||||
{'request': request, 'referer': referer, 'errormsg': str(exc)},
|
||||
extra={'spider': info.spider}, exc_info=True
|
||||
"File (error): Error processing file from %(request)s "
|
||||
"referred in <%(referer)s>: %(errormsg)s",
|
||||
{"request": request, "referer": referer, "errormsg": str(exc)},
|
||||
extra={"spider": info.spider},
|
||||
exc_info=True,
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
'File (unknown-error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'request': request, 'referer': referer},
|
||||
exc_info=True, extra={'spider': info.spider}
|
||||
"File (unknown-error): Error processing file from %(request)s "
|
||||
"referred in <%(referer)s>",
|
||||
{"request": request, "referer": referer},
|
||||
exc_info=True,
|
||||
extra={"spider": info.spider},
|
||||
)
|
||||
raise FileException(str(exc))
|
||||
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}
|
||||
return {
|
||||
"url": request.url,
|
||||
"path": path,
|
||||
"checksum": checksum,
|
||||
"status": status,
|
||||
}
|
||||
|
||||
def inc_stats(self, spider, status):
|
||||
spider.crawler.stats.inc_value('file_count', spider=spider)
|
||||
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
|
||||
spider.crawler.stats.inc_value("file_count", spider=spider)
|
||||
spider.crawler.stats.inc_value(f"file_status_count/{status}", spider=spider)
|
||||
|
||||
# Overridable Interface
|
||||
def get_media_requests(self, item, info):
|
||||
@ -513,8 +538,8 @@ class FilesPipeline(MediaPipeline):
|
||||
# Handles empty and wild extensions by trying to guess the
|
||||
# mime type then extension or default to empty string otherwise
|
||||
if media_ext not in mimetypes.types_map:
|
||||
media_ext = ''
|
||||
media_ext = ""
|
||||
media_type = mimetypes.guess_type(request.url)[0]
|
||||
if media_type:
|
||||
media_ext = mimetypes.guess_extension(media_type)
|
||||
return f'full/{media_guid}{media_ext}'
|
||||
return f"full/{media_guid}{media_ext}"
|
||||
|
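As a usage reminder (not part of the diff), FilesPipeline is enabled through settings and driven by the file_urls/files item fields; roughly, with illustrative setting values:

# settings.py (values are examples)
# ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
# FILES_STORE = "s3://my-bucket/downloads/"   # or a local path, gs:// or ftp:// URI

import scrapy

class DocumentItem(scrapy.Item):
    file_urls = scrapy.Field()  # input: URLs to download
    files = scrapy.Field()      # output: url/path/checksum/status dicts, as returned above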
@ -14,6 +14,7 @@ from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
|
||||
from scrapy.http import Request
|
||||
from scrapy.pipelines.files import FileException, FilesPipeline
|
||||
|
||||
# TODO: from scrapy.pipelines.media import MediaPipeline
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.misc import md5sum
|
||||
@ -24,7 +25,11 @@ class NoimagesDrop(DropItem):
|
||||
"""Product with no images exception"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
warnings.warn("The NoimagesDrop class is deprecated", category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
warnings.warn(
|
||||
"The NoimagesDrop class is deprecated",
|
||||
category=ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
|
||||
@ -33,11 +38,9 @@ class ImageException(FileException):
|
||||
|
||||
|
||||
class ImagesPipeline(FilesPipeline):
|
||||
"""Abstract pipeline that implement the image thumbnail generation logic
|
||||
"""Abstract pipeline that implement the image thumbnail generation logic"""
|
||||
|
||||
"""
|
||||
|
||||
MEDIA_NAME = 'image'
|
||||
MEDIA_NAME = "image"
|
||||
|
||||
# Uppercase attributes kept for backward compatibility with code that subclasses
|
||||
# ImagesPipeline. They may be overridden by settings.
|
||||
@ -45,16 +48,17 @@ class ImagesPipeline(FilesPipeline):
|
||||
MIN_HEIGHT = 0
|
||||
EXPIRES = 90
|
||||
THUMBS = {}
|
||||
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
|
||||
DEFAULT_IMAGES_RESULT_FIELD = 'images'
|
||||
DEFAULT_IMAGES_URLS_FIELD = "image_urls"
|
||||
DEFAULT_IMAGES_RESULT_FIELD = "images"
|
||||
|
||||
def __init__(self, store_uri, download_func=None, settings=None):
|
||||
try:
|
||||
from PIL import Image
|
||||
|
||||
self._Image = Image
|
||||
except ImportError:
|
||||
raise NotConfigured(
|
||||
'ImagesPipeline requires installing Pillow 4.0.0 or later'
|
||||
"ImagesPipeline requires installing Pillow 4.0.0 or later"
|
||||
)
|
||||
|
||||
super().__init__(store_uri, settings=settings, download_func=download_func)
|
||||
@ -62,12 +66,10 @@ class ImagesPipeline(FilesPipeline):
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
|
||||
resolve = functools.partial(self._key_for_pipe,
|
||||
base_class_name="ImagesPipeline",
|
||||
settings=settings)
|
||||
self.expires = settings.getint(
|
||||
resolve("IMAGES_EXPIRES"), self.EXPIRES
|
||||
resolve = functools.partial(
|
||||
self._key_for_pipe, base_class_name="ImagesPipeline", settings=settings
|
||||
)
|
||||
self.expires = settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
|
||||
|
||||
if not hasattr(self, "IMAGES_RESULT_FIELD"):
|
||||
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
|
||||
@ -75,47 +77,39 @@ class ImagesPipeline(FilesPipeline):
|
||||
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
|
||||
|
||||
self.images_urls_field = settings.get(
|
||||
resolve('IMAGES_URLS_FIELD'),
|
||||
self.IMAGES_URLS_FIELD
|
||||
resolve("IMAGES_URLS_FIELD"), self.IMAGES_URLS_FIELD
|
||||
)
|
||||
self.images_result_field = settings.get(
|
||||
resolve('IMAGES_RESULT_FIELD'),
|
||||
self.IMAGES_RESULT_FIELD
|
||||
)
|
||||
self.min_width = settings.getint(
|
||||
resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
|
||||
)
|
||||
self.min_height = settings.getint(
|
||||
resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
|
||||
)
|
||||
self.thumbs = settings.get(
|
||||
resolve('IMAGES_THUMBS'), self.THUMBS
|
||||
resolve("IMAGES_RESULT_FIELD"), self.IMAGES_RESULT_FIELD
|
||||
)
|
||||
self.min_width = settings.getint(resolve("IMAGES_MIN_WIDTH"), self.MIN_WIDTH)
|
||||
self.min_height = settings.getint(resolve("IMAGES_MIN_HEIGHT"), self.MIN_HEIGHT)
|
||||
self.thumbs = settings.get(resolve("IMAGES_THUMBS"), self.THUMBS)
|
||||
|
||||
self._deprecated_convert_image = None
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
s3store.AWS_SESSION_TOKEN = settings['AWS_SESSION_TOKEN']
|
||||
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
|
||||
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
|
||||
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
|
||||
s3store.AWS_VERIFY = settings['AWS_VERIFY']
|
||||
s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
|
||||
s3store = cls.STORE_SCHEMES["s3"]
|
||||
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
|
||||
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
|
||||
s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
|
||||
s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
|
||||
s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
|
||||
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
|
||||
s3store.POLICY = settings["IMAGES_STORE_S3_ACL"]
|
||||
|
||||
gcs_store = cls.STORE_SCHEMES['gs']
|
||||
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
|
||||
gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None
|
||||
gcs_store = cls.STORE_SCHEMES["gs"]
|
||||
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
|
||||
gcs_store.POLICY = settings["IMAGES_STORE_GCS_ACL"] or None
|
||||
|
||||
ftp_store = cls.STORE_SCHEMES['ftp']
|
||||
ftp_store.FTP_USERNAME = settings['FTP_USER']
|
||||
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
|
||||
ftp_store = cls.STORE_SCHEMES["ftp"]
|
||||
ftp_store.FTP_USERNAME = settings["FTP_USER"]
|
||||
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
|
||||
|
||||
store_uri = settings['IMAGES_STORE']
|
||||
store_uri = settings["IMAGES_STORE"]
|
||||
return cls(store_uri, settings=settings)
|
||||
|
||||
def file_downloaded(self, response, request, info, *, item=None):
|
||||
@ -129,9 +123,12 @@ class ImagesPipeline(FilesPipeline):
|
||||
checksum = md5sum(buf)
|
||||
width, height = image.size
|
||||
self.store.persist_file(
|
||||
path, buf, info,
|
||||
meta={'width': width, 'height': height},
|
||||
headers={'Content-Type': 'image/jpeg'})
|
||||
path,
|
||||
buf,
|
||||
info,
|
||||
meta={"width": width, "height": height},
|
||||
headers={"Content-Type": "image/jpeg"},
|
||||
)
|
||||
return checksum
|
||||
|
||||
def get_images(self, response, request, info, *, item=None):
|
||||
@ -140,25 +137,35 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
width, height = orig_image.size
|
||||
if width < self.min_width or height < self.min_height:
|
||||
raise ImageException("Image too small "
|
||||
f"({width}x{height} < "
|
||||
f"{self.min_width}x{self.min_height})")
|
||||
raise ImageException(
|
||||
"Image too small "
|
||||
f"({width}x{height} < "
|
||||
f"{self.min_width}x{self.min_height})"
|
||||
)
|
||||
|
||||
if self._deprecated_convert_image is None:
|
||||
self._deprecated_convert_image = 'response_body' not in get_func_args(self.convert_image)
|
||||
self._deprecated_convert_image = "response_body" not in get_func_args(
|
||||
self.convert_image
|
||||
)
|
||||
if self._deprecated_convert_image:
|
||||
warnings.warn(f'{self.__class__.__name__}.convert_image() method overriden in a deprecated way, '
|
||||
'overriden method does not accept response_body argument.',
|
||||
category=ScrapyDeprecationWarning)
|
||||
warnings.warn(
|
||||
f"{self.__class__.__name__}.convert_image() method overriden in a deprecated way, "
|
||||
"overriden method does not accept response_body argument.",
|
||||
category=ScrapyDeprecationWarning,
|
||||
)
|
||||
|
||||
if self._deprecated_convert_image:
|
||||
image, buf = self.convert_image(orig_image)
|
||||
else:
|
||||
image, buf = self.convert_image(orig_image, response_body=BytesIO(response.body))
|
||||
image, buf = self.convert_image(
|
||||
orig_image, response_body=BytesIO(response.body)
|
||||
)
|
||||
yield path, image, buf
|
||||
|
||||
for thumb_id, size in self.thumbs.items():
|
||||
thumb_path = self.thumb_path(request, thumb_id, response=response, info=info, item=item)
|
||||
thumb_path = self.thumb_path(
|
||||
request, thumb_id, response=response, info=info, item=item
|
||||
)
|
||||
if self._deprecated_convert_image:
|
||||
thumb_image, thumb_buf = self.convert_image(image, size)
|
||||
else:
|
||||
@ -167,21 +174,24 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
def convert_image(self, image, size=None, response_body=None):
|
||||
if response_body is None:
|
||||
warnings.warn(f'{self.__class__.__name__}.convert_image() method called in a deprecated way, '
|
||||
'method called without response_body argument.',
|
||||
category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
warnings.warn(
|
||||
f"{self.__class__.__name__}.convert_image() method called in a deprecated way, "
|
||||
"method called without response_body argument.",
|
||||
category=ScrapyDeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if image.format == 'PNG' and image.mode == 'RGBA':
|
||||
background = self._Image.new('RGBA', image.size, (255, 255, 255))
|
||||
if image.format == "PNG" and image.mode == "RGBA":
|
||||
background = self._Image.new("RGBA", image.size, (255, 255, 255))
|
||||
background.paste(image, image)
|
||||
image = background.convert('RGB')
|
||||
elif image.mode == 'P':
|
||||
image = background.convert("RGB")
|
||||
elif image.mode == "P":
|
||||
image = image.convert("RGBA")
|
||||
background = self._Image.new('RGBA', image.size, (255, 255, 255))
|
||||
background = self._Image.new("RGBA", image.size, (255, 255, 255))
|
||||
background.paste(image, image)
|
||||
image = background.convert('RGB')
|
||||
elif image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
image = background.convert("RGB")
|
||||
elif image.mode != "RGB":
|
||||
image = image.convert("RGB")
|
||||
|
||||
if size:
|
||||
image = image.copy()
|
||||
@ -193,11 +203,11 @@ class ImagesPipeline(FilesPipeline):
|
||||
except AttributeError:
|
||||
resampling_filter = self._Image.ANTIALIAS
|
||||
image.thumbnail(size, resampling_filter)
|
||||
elif response_body is not None and image.format == 'JPEG':
|
||||
elif response_body is not None and image.format == "JPEG":
|
||||
return image, response_body
|
||||
|
||||
buf = BytesIO()
|
||||
image.save(buf, 'JPEG')
|
||||
image.save(buf, "JPEG")
|
||||
return image, buf
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
@ -211,8 +221,8 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return f'full/{image_guid}.jpg'
|
||||
return f"full/{image_guid}.jpg"
|
||||
|
||||
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
|
||||
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
|
||||
return f"thumbs/{thumb_id}/{thumb_guid}.jpg"
|
||||
|
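And correspondingly for ImagesPipeline (again a sketch; the store path, thumbnail sizes and minimum dimensions are made-up values):

# settings.py (illustrative values)
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
# IMAGES_STORE = "/data/images"
# IMAGES_THUMBS = {"small": (50, 50), "big": (270, 270)}
# IMAGES_MIN_WIDTH = 110
# IMAGES_MIN_HEIGHT = 110

import scrapy

class ProductItem(scrapy.Item):
    image_urls = scrapy.Field()  # input field read by the pipeline
    images = scrapy.Field()      # populated with url/path/checksum/status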
Some files were not shown because too many files have changed in this diff.