Merge remote-tracking branch 'upstream/master' into fix_iternodes
.bandit.yml (new file, 18 lines)
@@ -0,0 +1,18 @@
skips:
- B101
- B105
- B301
- B303
- B306
- B307
- B311
- B320
- B321
- B402 # https://github.com/scrapy/scrapy/issues/4180
- B403
- B404
- B406
- B410
- B503
- B603
- B605
@@ -1,8 +1,7 @@
[bumpversion]
current_version = 0.25.1
current_version = 2.3.0
commit = True
tag = True
tag_name = {new_version}

[bumpversion:file:scrapy/VERSION]
@@ -1,3 +1,5 @@
[run]
branch = true
include = scrapy/*
omit = scrapy/xlib*,scrapy/tests*
omit =
    tests/*
.gitattributes (vendored, new file, 1 line)
@@ -0,0 +1 @@
tests/sample_data/** binary
.github/ISSUE_TEMPLATE/bug_report.md (vendored, new file, 41 lines)
@@ -0,0 +1,41 @@
---
name: Bug report
about: Report a problem to help us improve
---

<!--

Thanks for taking an interest in Scrapy!

If you have a question that starts with "How to...", please see the Scrapy Community page: https://scrapy.org/community/.
The GitHub issue tracker's purpose is to deal with bug reports and feature requests for the project itself.

Keep in mind that by filing an issue, you are expected to comply with Scrapy's Code of Conduct, including treating everyone with respect: https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md

The following is a suggested template to structure your issue, you can find more guidelines at https://doc.scrapy.org/en/latest/contributing.html#reporting-bugs

-->

### Description

[Description of the issue]

### Steps to Reproduce

1. [First Step]
2. [Second Step]
3. [and so on...]

**Expected behavior:** [What you expect to happen]

**Actual behavior:** [What actually happens]

**Reproduces how often:** [What percentage of the time does it reproduce?]

### Versions

Please paste here the output of executing `scrapy version --verbose` in the command line.

### Additional context

Any additional information, configuration, data or output from commands that might be necessary to reproduce or understand the issue. Please try not to include screenshots of code or the command line, paste the contents as text instead. You can use [GitHub Flavored Markdown](https://help.github.com/en/articles/creating-and-highlighting-code-blocks) to make the text look better.
.github/ISSUE_TEMPLATE/feature_request.md (vendored, new file, 33 lines)
@@ -0,0 +1,33 @@
---
name: Feature request
about: Suggest an idea for an enhancement or new feature
---

<!--

Thanks for taking an interest in Scrapy!

If you have a question that starts with "How to...", please see the Scrapy Community page: https://scrapy.org/community/.
The GitHub issue tracker's purpose is to deal with bug reports and feature requests for the project itself.

Keep in mind that by filing an issue, you are expected to comply with Scrapy's Code of Conduct, including treating everyone with respect: https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md

The following is a suggested template to structure your pull request, you can find more guidelines at https://doc.scrapy.org/en/latest/contributing.html#writing-patches and https://doc.scrapy.org/en/latest/contributing.html#submitting-patches

-->

## Summary

One paragraph explanation of the feature.

## Motivation

Why are we doing this? What use cases does it support? What is the expected outcome?

## Describe alternatives you've considered

A clear and concise description of the alternative solutions you've considered. Be sure to explain why Scrapy's existing customizability isn't suitable for this feature.

## Additional context

Any additional information about the feature request here.
.gitignore (vendored, 11 lines changed)
@@ -1,3 +1,5 @@
/.vagrant
/scrapy.iml
*.pyc
_trial_temp*
dropin.cache
@@ -8,3 +10,12 @@ venv
build
dist
.idea
htmlcov/
.coverage
.pytest_cache/
.coverage.*
.cache/
.mypy_cache/

# Windows
Thumbs.db
.readthedocs.yml (new file, 12 lines)
@@ -0,0 +1,12 @@
version: 2
formats: all
sphinx:
  configuration: docs/conf.py
  fail_on_warning: true
python:
  # For available versions, see:
  # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-image
  version: 3.7 # Keep in sync with .travis.yml
  install:
    - requirements: docs/requirements.txt
    - path: .
@@ -1,15 +0,0 @@
#!/bin/bash
set -e
set -x

if [[ "${TOXENV}" == "pypy" ]]; then
    sudo add-apt-repository -y ppa:pypy/ppa
    sudo apt-get -qy update
    sudo apt-get install -y pypy pypy-dev
    # This is required because we need to get rid of the Travis installed PyPy
    # or it'll take precedence over the PPA installed one.
    sudo rm -rf /usr/local/pypy/bin
fi

# Workaround travis-ci/travis-ci#2065
pip install -U wheel
.travis.yml (72 lines changed)
@@ -1,19 +1,74 @@
language: python
python: 2.7
env:
- TOXENV=py27
- TOXENV=precise
- TOXENV=py33
dist: xenial
branches:
only:
- master
- /^\d\.\d+$/
- /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
matrix:
include:
- env: TOXENV=security
python: 3.8
- env: TOXENV=flake8
python: 3.8
- env: TOXENV=pylint
python: 3.8
- env: TOXENV=docs
python: 3.7 # Keep in sync with .readthedocs.yml
- env: TOXENV=typing
python: 3.8

- env: TOXENV=pinned
python: 3.5.2
- env: TOXENV=asyncio-pinned
python: 3.5.2 # We use additional code to support 3.5.3 and earlier
- env: TOXENV=pypy3-pinned PYPY_VERSION=3-v5.9.0

- env: TOXENV=py
python: 3.5
- env: TOXENV=asyncio
python: 3.5 # We use specific code to support >= 3.5.4, < 3.6
- env: TOXENV=pypy3 PYPY_VERSION=3.5-v7.0.0

- env: TOXENV=py
python: 3.6
- env: TOXENV=pypy3 PYPY_VERSION=3.6-v7.3.1

- env: TOXENV=py
python: 3.7

- env: TOXENV=py PYPI_RELEASE_JOB=true
python: 3.8
dist: bionic
- env: TOXENV=extra-deps
python: 3.8
dist: bionic
- env: TOXENV=asyncio
python: 3.8
dist: bionic
install:
- "./.travis-workarounds.sh"
- pip install -U tox
- |
  if [[ ! -z "$PYPY_VERSION" ]]; then
    export PYPY_VERSION="pypy$PYPY_VERSION-linux64"
    wget "https://downloads.python.org/pypy/${PYPY_VERSION}.tar.bz2"
    tar -jxf ${PYPY_VERSION}.tar.bz2
    virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION"
    source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate"
  fi
- pip install -U tox twine wheel codecov

script: tox
after_success:
- codecov
notifications:
irc:
use_notice: true
skip_join: true
channels:
- irc.freenode.org#scrapy
cache:
directories:
- $HOME/.cache/pip
deploy:
provider: pypi
distributions: "sdist bdist_wheel"
@@ -22,6 +77,5 @@ deploy:
secure: JaAKcy1AXWXDK3LXdjOtKyaVPCSFoCGCnW15g4f65E/8Fsi9ZzDfmBa4Equs3IQb/vs/if2SVrzJSr7arN7r9Z38Iv1mUXHkFAyA3Ym8mThfABBzzcUWEQhIHrCX0Tdlx9wQkkhs+PZhorlmRS4gg5s6DzPaeA2g8SCgmlRmFfA=
on:
tags: true
all_branches: true
repo: scrapy/scrapy
condition: "$TOXENV == py27 && $TRAVIS_TAG =~ ^[0-9][.][0-9]*[02468][.]"
condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
CODE_OF_CONDUCT.md (new file, 74 lines)
@@ -0,0 +1,74 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, gender identity and expression, level of experience,
nationality, personal appearance, race, religion, or sexual identity and
orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at opensource@scrapinghub.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at [http://contributor-covenant.org/version/1/4][version].

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
@@ -1,2 +1,6 @@
The guidelines for contributing are available here:
http://doc.scrapy.org/en/latest/contributing.html
https://docs.scrapy.org/en/master/contributing.html

Please do not abuse the issue tracker for support questions.
If your issue topic can be rephrased to "How to ...?", please use the
support channels to get it answered: https://scrapy.org/community/
INSTALL (2 lines changed)
@@ -1,4 +1,4 @@
For information about installing Scrapy see:

* docs/intro/install.rst (local file)
* http://doc.scrapy.org/en/latest/intro/install.html (online version)
* https://docs.scrapy.org/en/latest/intro/install.html (online version)
LICENSE (10 lines changed)
@@ -4,11 +4,11 @@ All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
1. Redistributions of source code must retain the above copyright notice,
   this list of conditions, and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

3. Neither the name of Scrapy nor the names of its contributors may be used
MANIFEST.in (13 lines changed)
@@ -3,11 +3,24 @@ include AUTHORS
include INSTALL
include LICENSE
include MANIFEST.in
include NEWS

include scrapy/VERSION
include scrapy/mime.types

include codecov.yml
include conftest.py
include pytest.ini
include requirements-*.txt
include tox.ini

recursive-include scrapy/templates *
recursive-include scrapy license.txt
recursive-include docs *
prune docs/build

recursive-include extras *
recursive-include bin *
recursive-include tests *

global-exclude __pycache__ *.py[cod]
@@ -1,21 +0,0 @@
TRIAL := $(shell which trial)
BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
export PYTHONPATH=$(PWD)

test:
    coverage run --branch $(TRIAL) --reporter=text tests
    rm -rf htmlcov && coverage html
    -s3cmd sync -P htmlcov/ s3://static.scrapy.org/coverage-scrapy-$(BRANCH)/

build:
    test $(BRANCH) != master || git describe >scrapy/VERSION
    python extras/makedeb.py build

clean:
    git checkout debian scrapy/VERSION
    git clean -dfq

pypi:
    umask 0022 && chmod -R a+rX . && python setup.py sdist upload

.PHONY: clean test build
README.rst (73 lines changed)
@@ -2,31 +2,46 @@
Scrapy
======

.. image:: https://badge.fury.io/py/Scrapy.png
   :target: http://badge.fury.io/py/Scrapy
.. image:: https://img.shields.io/pypi/v/Scrapy.svg
   :target: https://pypi.python.org/pypi/Scrapy
   :alt: PyPI Version

.. image:: https://secure.travis-ci.org/scrapy/scrapy.png?branch=master
   :target: http://travis-ci.org/scrapy/scrapy
.. image:: https://img.shields.io/pypi/pyversions/Scrapy.svg
   :target: https://pypi.python.org/pypi/Scrapy
   :alt: Supported Python Versions

.. image:: https://img.shields.io/travis/scrapy/scrapy/master.svg
   :target: https://travis-ci.org/scrapy/scrapy
   :alt: Build Status

.. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg
   :target: https://pypi.python.org/pypi/Scrapy
   :alt: Wheel Status

.. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg
   :target: https://codecov.io/github/scrapy/scrapy?branch=master
   :alt: Coverage report

.. image:: https://anaconda.org/conda-forge/scrapy/badges/version.svg
   :target: https://anaconda.org/conda-forge/scrapy
   :alt: Conda Version

.. image:: https://pypip.in/wheel/Scrapy/badge.png
   :target: https://pypi.python.org/pypi/Scrapy/
   :alt: Wheel Status

Overview
========

Scrapy is a fast high-level screen scraping and web crawling framework, used to
Scrapy is a fast high-level web crawling and web scraping framework, used to
crawl websites and extract structured data from their pages. It can be used for
a wide range of purposes, from data mining to monitoring and automated testing.

For more information including a list of features check the Scrapy homepage at:
http://scrapy.org
Check the Scrapy homepage at https://scrapy.org for more information,
including a list of features.

Requirements
============

* Python 2.7
* Works on Linux, Windows, Mac OSX, BSD
* Python 3.5.2+
* Works on Linux, Windows, macOS, BSD

Install
=======
@@ -35,37 +50,45 @@ The quick way::

    pip install scrapy

For more details see the install section in the documentation:
http://doc.scrapy.org/en/latest/intro/install.html

Releases
========

You can download the latest stable and development releases from:
http://scrapy.org/download/
See the install section in the documentation at
https://docs.scrapy.org/en/latest/intro/install.html for more details.

Documentation
=============

Documentation is available online at http://doc.scrapy.org/ and in the ``docs``
Documentation is available online at https://docs.scrapy.org/ and in the ``docs``
directory.

Releases
========

You can check https://docs.scrapy.org/en/latest/news.html for the release notes.

Community (blog, twitter, mail list, IRC)
=========================================

See http://scrapy.org/community/
See https://scrapy.org/community/ for details.

Contributing
============

See http://doc.scrapy.org/en/latest/contributing.html
See https://docs.scrapy.org/en/master/contributing.html for details.

Code of Conduct
---------------

Please note that this project is released with a Contributor Code of Conduct
(see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md).

By participating in this project you agree to abide by its terms.
Please report unacceptable behavior to opensource@scrapinghub.com.

Companies using Scrapy
======================

See http://scrapy.org/companies/
See https://scrapy.org/companies/ for a list.

Commercial Support
==================

See http://scrapy.org/support/
See https://scrapy.org/support/ for details.
@@ -1,3 +1,4 @@
==============
Scrapy artwork
==============

@@ -8,10 +9,10 @@ scrapy-logo.jpg

Main Scrapy logo, in JPEG format.

qlassik.zip
qlassik.zip
-----------

Font used for Scrapy logo. Homepage: http://www.dafont.com/qlassik.font
Font used for Scrapy logo. Homepage: https://www.dafont.com/qlassik.font

scrapy-blog.logo.xcf
--------------------
azure-pipelines.yml (new file, 24 lines)
@@ -0,0 +1,24 @@
variables:
  TOXENV: py
pool:
  vmImage: 'windows-latest'
strategy:
  matrix:
    Python35:
      python.version: '3.5'
      TOXENV: windows-pinned
    Python36:
      python.version: '3.6'
    Python37:
      python.version: '3.7'
    Python38:
      python.version: '3.8'
steps:
- task: UsePythonVersion@0
  inputs:
    versionSpec: '$(python.version)'
  displayName: 'Use Python $(python.version)'
- script: |
    pip install -U tox twine wheel codecov
    tox
  displayName: 'Run test suite'
@@ -1,4 +0,0 @@
#!/usr/bin/env python

from scrapy.cmdline import execute
execute()
codecov.yml (new file, 6 lines)
@@ -0,0 +1,6 @@
comment:
  layout: "header, diff, tree"

coverage:
  status:
    project: false
conftest.py (82 lines changed)
@@ -1,49 +1,55 @@
import six
from pathlib import Path

import pytest
from twisted.python import log

from scrapy import optional_features

collect_ignore = ["scrapy/stats.py"]
if 'django' not in optional_features:
    collect_ignore.append("tests/test_djangoitem/models.py")

if six.PY3:
    for fn in open('tests/py3-ignores.txt'):
        if fn.strip():
            collect_ignore.append(fn.strip())

class LogObservers:
    """Class for keeping track of log observers across test modules"""

    def __init__(self):
        self.observers = []

    def add(self, logfile='test.log'):
        fileobj = open(logfile, 'wb')
        observer = log.FileLogObserver(fileobj)
        log.startLoggingWithObserver(observer.emit, 0)
        self.observers.append((fileobj, observer))

    def remove(self):
        fileobj, observer = self.observers.pop()
        log.removeObserver(observer.emit)
        fileobj.close()


@pytest.fixture(scope='module')
def log_observers():
    return LogObservers()
def _py_files(folder):
    return (str(p) for p in Path(folder).rglob('*.py'))


@pytest.fixture()
def setlog(request, log_observers):
    """Attach test.log file observer to twisted log, for trial compatibility"""
    log_observers.add()
    request.addfinalizer(log_observers.remove)
collect_ignore = [
    # not a test, but looks like a test
    "scrapy/utils/testsite.py",
    # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess
    *_py_files("tests/CrawlerProcess"),
    # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess
    *_py_files("tests/CrawlerRunner"),
    # Py36-only parts of respective tests
    *_py_files("tests/py36"),
]

for line in open('tests/ignores.txt'):
    file_path = line.strip()
    if file_path and file_path[0] != '#':
        collect_ignore.append(file_path)


@pytest.fixture()
def chdir(tmpdir):
    """Change to pytest-provided temporary directory"""
    tmpdir.chdir()


def pytest_collection_modifyitems(session, config, items):
    # Avoid executing tests when executing `--flake8` flag (pytest-flake8)
    try:
        from pytest_flake8 import Flake8Item
        if config.getoption('--flake8'):
            items[:] = [item for item in items if isinstance(item, Flake8Item)]
    except ImportError:
        pass


@pytest.fixture(scope='class')
def reactor_pytest(request):
    if not request.cls:
        # doctests
        return
    request.cls.reactor_pytest = request.config.getoption("--reactor")
    return request.cls.reactor_pytest


@pytest.fixture(autouse=True)
def only_asyncio(request, reactor_pytest):
    if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
        pytest.skip('This test is only run with --reactor=asyncio')
debian/changelog (vendored, deleted, 5 lines)
@@ -1,5 +0,0 @@
scrapy-SUFFIX (0.11) unstable; urgency=low

  * Initial release.

 -- Scrapinghub Team <info@scrapinghub.com> Thu, 10 Jun 2010 17:24:02 -0300
debian/compat (vendored, deleted, 1 line)
@@ -1 +0,0 @@
7
debian/control (vendored, deleted, 20 lines)
@@ -1,20 +0,0 @@
Source: scrapy-SUFFIX
Section: python
Priority: optional
Maintainer: Scrapinghub Team <info@scrapinghub.com>
Build-Depends: debhelper (>= 7.0.50), python (>=2.7), python-twisted, python-w3lib, python-lxml, python-six (>=1.5.2)
Standards-Version: 3.8.4
Homepage: http://scrapy.org/

Package: scrapy-SUFFIX
Architecture: all
Depends: ${python:Depends}, python-lxml, python-twisted, python-openssl,
 python-w3lib (>= 1.8.0), python-queuelib, python-cssselect (>= 0.9), python-six (>=1.5.2)
Recommends: python-setuptools
Conflicts: python-scrapy, scrapy, scrapy-0.11
Provides: python-scrapy, scrapy
Description: Python web crawling and scraping framework
 Scrapy is a fast high-level screen scraping and web crawling framework,
 used to crawl websites and extract structured data from their pages.
 It can be used for a wide range of purposes, from data mining to
 monitoring and automated testing.
debian/copyright (vendored, deleted, 40 lines)
@@ -1,40 +0,0 @@
This package was debianized by the Scrapinghub team <info@scrapinghub.com>.

It was downloaded from http://scrapy.org

Upstream Author: Scrapy Developers

Copyright: 2007-2013 Scrapy Developers

License: bsd

Copyright (c) Scrapy developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

3. Neither the name of Scrapy nor the names of its contributors may be used
   to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The Debian packaging is (C) 2010-2013, Scrapinghub <info@scrapinghub.com> and
is licensed under the BSD, see `/usr/share/common-licenses/BSD'.
debian/pyversions (vendored, deleted, 1 line)
@@ -1 +0,0 @@
2.7
debian/rules (vendored, deleted, 5 lines)
@@ -1,5 +0,0 @@
#!/usr/bin/make -f
# -*- makefile -*-

%:
    dh $@
debian/scrapy.docs (vendored, deleted, 2 lines)
@@ -1,2 +0,0 @@
README.rst
AUTHORS
debian/scrapy.install (vendored, deleted, 1 line)
@@ -1 +0,0 @@
extras/scrapy_bash_completion etc/bash_completion.d/
debian/scrapy.lintian-overrides (vendored, deleted, 2 lines)
@@ -1,2 +0,0 @@
new-package-should-close-itp-bug
extra-license-file usr/share/pyshared/scrapy/xlib/pydispatch/license.txt
debian/scrapy.manpages (vendored, deleted, 1 line)
@@ -1 +0,0 @@
extras/scrapy.1
@@ -8,8 +8,10 @@ PYTHON = python
SPHINXOPTS =
PAPER =
SOURCES =
SHELL = /bin/bash

ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees -D latex_paper_size=$(PAPER) \
ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \
    -D latex_elements.papersize=$(PAPER) \
    $(SPHINXOPTS) . build/$(BUILDER) $(SOURCES)

.PHONY: help update build html htmlhelp clean
@@ -22,13 +24,19 @@ help:
    @echo " text to make plain text files"
    @echo " changes to make an overview over all changed/added/deprecated items"
    @echo " linkcheck to check all external links for integrity"
    @echo " watch build HTML docs, open in browser and watch for changes"


build:
build-dirs:
    mkdir -p build/$(BUILDER) build/doctrees

build: build-dirs
    sphinx-build $(ALLSPHINXOPTS)
    @echo

build-ignore-errors: build-dirs
    -sphinx-build $(ALLSPHINXOPTS)
    @echo


html: BUILDER = html
html: build
@@ -58,6 +66,12 @@ linkcheck: build
    @echo "Link check complete; look for any errors in the above output " \
        "or in build/$(BUILDER)/output.txt"

linkfix: BUILDER = linkcheck
linkfix: build-ignore-errors
    $(PYTHON) utils/linkfix.py
    @echo "Fixing redirecting links in docs has finished; check all " \
        "replacements before committing them"

doctest: BUILDER = doctest
doctest: build
    @echo "Testing of doctests in the sources finished, look at the " \
@@ -68,9 +82,15 @@ pydoc-topics: build
    @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
        "into the Lib/ directory"

coverage: BUILDER = coverage
coverage: build

htmlview: html
    $(PYTHON) -c "import webbrowser; webbrowser.open('build/html/index.html')"
    $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
        os.path.realpath('build/html/index.html'))"

clean:
    -rm -rf build/*

watch: htmlview
    watchmedo shell-command -p '*.rst' -c 'make html' -R -D
@@ -1,3 +1,5 @@
:orphan:

======================================
Scrapy documentation quick start guide
======================================
@@ -8,16 +10,12 @@ This file provides a quick guide on how to compile the Scrapy documentation.
Setup the environment
---------------------

To compile the documentation you need the following Python libraries:
To compile the documentation you need Sphinx Python library. To install it
and all its dependencies run the following command from this dir

* Sphinx
* docutils
* jinja
::

If you have setuptools available the following command will install all of them
(since Sphinx requires both docutils and jinja)::

    easy_install Sphinx
    pip install -r requirements.txt


Compile the documentation
@@ -52,3 +50,19 @@ To cleanup all generated documentation files and start from scratch run::
Keep in mind that this command won't touch any documentation source files.


Recreating documentation on the fly
-----------------------------------

There is a way to recreate the doc automatically when you make changes, you
need to install watchdog (``pip install watchdog``) and then use::

    make watch

Alternative method using tox
----------------------------

To compile the documentation to HTML run the following command::

    tox -e docs

Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir.
@@ -1,5 +1,82 @@
from docutils.parsers.rst.roles import set_classes
from docutils import nodes
from docutils.parsers.rst import Directive
from sphinx.util.nodes import make_refnode
from operator import itemgetter


class settingslist_node(nodes.General, nodes.Element):
    pass


class SettingsListDirective(Directive):
    def run(self):
        return [settingslist_node('')]


def is_setting_index(node):
    if node.tagname == 'index':
        # index entries for setting directives look like:
        # [('pair', 'SETTING_NAME; setting', 'std:setting-SETTING_NAME', '')]
        entry_type, info, refid = node['entries'][0][:3]
        return entry_type == 'pair' and info.endswith('; setting')
    return False


def get_setting_target(node):
    # target nodes are placed next to the node in the doc tree
    return node.parent[node.parent.index(node) + 1]


def get_setting_name_and_refid(node):
    """Extract setting name from directive index node"""
    entry_type, info, refid = node['entries'][0][:3]
    return info.replace('; setting', ''), refid


def collect_scrapy_settings_refs(app, doctree):
    env = app.builder.env

    if not hasattr(env, 'scrapy_all_settings'):
        env.scrapy_all_settings = []

    for node in doctree.traverse(is_setting_index):
        targetnode = get_setting_target(node)
        assert isinstance(targetnode, nodes.target), "Next node is not a target"

        setting_name, refid = get_setting_name_and_refid(node)

        env.scrapy_all_settings.append({
            'docname': env.docname,
            'setting_name': setting_name,
            'refid': refid,
        })


def make_setting_element(setting_data, app, fromdocname):
    refnode = make_refnode(app.builder, fromdocname,
                           todocname=setting_data['docname'],
                           targetid=setting_data['refid'],
                           child=nodes.Text(setting_data['setting_name']))
    p = nodes.paragraph()
    p += refnode

    item = nodes.list_item()
    item += p
    return item


def replace_settingslist_nodes(app, doctree, fromdocname):
    env = app.builder.env

    for node in doctree.traverse(settingslist_node):
        settings_list = nodes.bullet_list()
        settings_list.extend([make_setting_element(d, app, fromdocname)
                              for d in sorted(env.scrapy_all_settings,
                                              key=itemgetter('setting_name'))
                              if fromdocname != d['docname']])
        node.replace_self(settings_list)


def setup(app):
    app.add_crossref_type(
@@ -27,24 +104,34 @@ def setup(app):
    app.add_role('issue', issue_role)
    app.add_role('rev', rev_role)

    app.add_node(settingslist_node)
    app.add_directive('settingslist', SettingsListDirective)

    app.connect('doctree-read', collect_scrapy_settings_refs)
    app.connect('doctree-resolved', replace_settingslist_nodes)


def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/blob/master/' + text
    set_classes(options)
    node = nodes.reference(rawtext, text, refuri=ref, **options)
    return [node], []


def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/issues/' + text
    set_classes(options)
    node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options)
    return [node], []


def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'https://github.com/scrapy/scrapy/commit/' + text
    set_classes(options)
    node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options)
    return [node], []


def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    ref = 'http://hg.scrapy.org/scrapy/changeset/' + text
    set_classes(options)
docs/_static/scrapydoc.css (vendored, deleted, 657 lines)
@@ -1,657 +0,0 @@
/**
|
||||
* Sphinx Doc Design
|
||||
*/
|
||||
|
||||
body {
|
||||
font-family: sans-serif;
|
||||
font-size: 100%;
|
||||
background-color: #3d1e11;
|
||||
color: #000;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* :::: LAYOUT :::: */
|
||||
|
||||
div.document {
|
||||
background-color: #69341e;
|
||||
}
|
||||
|
||||
div.documentwrapper {
|
||||
float: left;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
div.bodywrapper {
|
||||
margin: 0 0 0 230px;
|
||||
}
|
||||
|
||||
div.body {
|
||||
background-color: white;
|
||||
padding: 0 20px 30px 20px;
|
||||
}
|
||||
|
||||
div.sphinxsidebarwrapper {
|
||||
padding: 10px 5px 0 10px;
|
||||
}
|
||||
|
||||
div.sphinxsidebar {
|
||||
float: left;
|
||||
width: 230px;
|
||||
margin-left: -100%;
|
||||
font-size: 90%;
|
||||
}
|
||||
|
||||
div.clearer {
|
||||
clear: both;
|
||||
}
|
||||
|
||||
div.footer {
|
||||
color: #fff;
|
||||
width: 100%;
|
||||
padding: 9px 0 9px 0;
|
||||
text-align: center;
|
||||
font-size: 75%;
|
||||
}
|
||||
|
||||
div.footer a {
|
||||
color: #fff;
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
div.related {
|
||||
background-color: #5b1616;
|
||||
color: #fff;
|
||||
width: 100%;
|
||||
line-height: 30px;
|
||||
font-size: 90%;
|
||||
}
|
||||
|
||||
div.related h3 {
|
||||
display: none;
|
||||
}
|
||||
|
||||
div.related ul {
|
||||
margin: 0;
|
||||
padding: 0 0 0 10px;
|
||||
list-style: none;
|
||||
}
|
||||
|
||||
div.related li {
|
||||
display: inline;
|
||||
}
|
||||
|
||||
div.related li.right {
|
||||
float: right;
|
||||
margin-right: 5px;
|
||||
}
|
||||
|
||||
div.related a {
|
||||
color: white;
|
||||
}
|
||||
|
||||
/* ::: TOC :::: */
|
||||
div.sphinxsidebar h3 {
|
||||
font-family: 'Trebuchet MS', sans-serif;
|
||||
color: white;
|
||||
font-size: 1.4em;
|
||||
font-weight: normal;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
div.sphinxsidebar h3 a {
|
||||
color: white;
|
||||
}
|
||||
|
||||
div.sphinxsidebar h4 {
|
||||
font-family: 'Trebuchet MS', sans-serif;
|
||||
color: white;
|
||||
font-size: 1.3em;
|
||||
font-weight: normal;
|
||||
margin: 5px 0 0 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
div.sphinxsidebar p {
|
||||
color: white;
|
||||
}
|
||||
|
||||
div.sphinxsidebar p.topless {
|
||||
margin: 5px 10px 10px 10px;
|
||||
}
|
||||
|
||||
div.sphinxsidebar ul {
|
||||
margin: 10px;
|
||||
padding: 0;
|
||||
list-style: none;
|
||||
color: white;
|
||||
}
|
||||
|
||||
div.sphinxsidebar ul ul,
|
||||
div.sphinxsidebar ul.want-points {
|
||||
margin-left: 20px;
|
||||
list-style: square;
|
||||
}
|
||||
|
||||
div.sphinxsidebar ul ul {
|
||||
margin-top: 0;
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
div.sphinxsidebar a {
|
||||
color: #ffca9b;
|
||||
}
|
||||
|
||||
div.sphinxsidebar form {
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
div.sphinxsidebar input {
|
||||
border: 1px solid #ffca9b;
|
||||
font-family: sans-serif;
|
||||
font-size: 1em;
|
||||
}
|
||||
|
||||
/* :::: MODULE CLOUD :::: */
|
||||
div.modulecloud {
|
||||
margin: -5px 10px 5px 10px;
|
||||
padding: 10px;
|
||||
line-height: 160%;
|
||||
border: 1px solid #cbe7e5;
|
||||
background-color: #f2fbfd;
|
||||
}
|
||||
|
||||
div.modulecloud a {
|
||||
padding: 0 5px 0 5px;
|
||||
}
|
||||
|
||||
/* :::: SEARCH :::: */
|
||||
ul.search {
|
||||
margin: 10px 0 0 20px;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
ul.search li {
|
||||
padding: 5px 0 5px 20px;
|
||||
background-image: url(file.png);
|
||||
background-repeat: no-repeat;
|
||||
background-position: 0 7px;
|
||||
}
|
||||
|
||||
ul.search li a {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
ul.search li div.context {
|
||||
color: #888;
|
||||
margin: 2px 0 0 30px;
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
ul.keywordmatches li.goodmatch a {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* :::: COMMON FORM STYLES :::: */
|
||||
|
||||
div.actions {
|
||||
padding: 5px 10px 5px 10px;
|
||||
border-top: 1px solid #cbe7e5;
|
||||
border-bottom: 1px solid #cbe7e5;
|
||||
background-color: #e0f6f4;
|
||||
}
|
||||
|
||||
form dl {
|
||||
color: #333;
|
||||
}
|
||||
|
||||
form dt {
|
||||
clear: both;
|
||||
float: left;
|
||||
min-width: 110px;
|
||||
margin-right: 10px;
|
||||
padding-top: 2px;
|
||||
}
|
||||
|
||||
input#homepage {
|
||||
display: none;
|
||||
}
|
||||
|
||||
div.error {
|
||||
margin: 5px 20px 0 0;
|
||||
padding: 5px;
|
||||
border: 1px solid #d00;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* :::: INDEX PAGE :::: */
|
||||
|
||||
table.contentstable {
|
||||
width: 90%;
|
||||
}
|
||||
|
||||
table.contentstable p.biglink {
|
||||
line-height: 150%;
|
||||
}
|
||||
|
||||
a.biglink {
|
||||
font-size: 1.3em;
|
||||
}
|
||||
|
||||
span.linkdescr {
|
||||
font-style: italic;
|
||||
padding-top: 5px;
|
||||
font-size: 90%;
|
||||
}
|
||||
|
||||
/* :::: INDEX STYLES :::: */
|
||||
|
||||
table.indextable td {
|
||||
text-align: left;
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
table.indextable dl, table.indextable dd {
|
||||
margin-top: 0;
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
table.indextable tr.pcap {
|
||||
height: 10px;
|
||||
}
|
||||
|
||||
table.indextable tr.cap {
|
||||
margin-top: 10px;
|
||||
background-color: #f2f2f2;
|
||||
}
|
||||
|
||||
img.toggler {
|
||||
margin-right: 3px;
|
||||
margin-top: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
form.pfform {
|
||||
margin: 10px 0 20px 0;
|
||||
}
|
||||
|
||||
/* :::: GLOBAL STYLES :::: */
|
||||
|
||||
.docwarning {
|
||||
background-color: #ffe4e4;
|
||||
padding: 10px;
|
||||
margin: 0 -20px 0 -20px;
|
||||
border-bottom: 1px solid #f66;
|
||||
}
|
||||
|
||||
p.subhead {
|
||||
font-weight: bold;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #6e0909;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
div.body h1,
|
||||
div.body h2,
|
||||
div.body h3,
|
||||
div.body h4,
|
||||
div.body h5,
|
||||
div.body h6 {
|
||||
font-family: 'Trebuchet MS', sans-serif;
|
||||
background-color: #f2f2f2;
|
||||
font-weight: normal;
|
||||
color: #331F0A;
|
||||
border-bottom: 1px solid #ccc;
|
||||
margin: 20px -20px 10px -20px;
|
||||
padding: 3px 0 3px 10px;
|
||||
}
|
||||
|
||||
div.body h1 { margin-top: 0; font-size: 200%; }
|
||||
div.body h2 { font-size: 160%; }
|
||||
div.body h3 { font-size: 140%; }
|
||||
div.body h4 { font-size: 120%; }
|
||||
div.body h5 { font-size: 110%; }
|
||||
div.body h6 { font-size: 100%; }
|
||||
|
||||
a.headerlink {
|
||||
color: #c60f0f;
|
||||
font-size: 0.8em;
|
||||
padding: 0 4px 0 4px;
|
||||
text-decoration: none;
|
||||
visibility: hidden;
|
||||
}
|
||||
|
||||
h1:hover > a.headerlink,
|
||||
h2:hover > a.headerlink,
|
||||
h3:hover > a.headerlink,
|
||||
h4:hover > a.headerlink,
|
||||
h5:hover > a.headerlink,
|
||||
h6:hover > a.headerlink,
|
||||
dt:hover > a.headerlink {
|
||||
visibility: visible;
|
||||
}
|
||||
|
||||
a.headerlink:hover {
|
||||
background-color: #c60f0f;
|
||||
color: white;
|
||||
}
|
||||
|
||||
div.body p, div.body dd, div.body li {
|
||||
text-align: justify;
|
||||
line-height: 130%;
|
||||
}
|
||||
|
||||
div.body p.caption {
|
||||
text-align: inherit;
|
||||
}
|
||||
|
||||
div.body td {
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
ul.fakelist {
|
||||
list-style: none;
|
||||
margin: 10px 0 10px 20px;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.field-list ul {
|
||||
padding-left: 1em;
|
||||
}
|
||||
|
||||
.first {
|
||||
margin-top: 0 !important;
|
||||
}
|
||||
|
||||
/* "Footnotes" heading */
|
||||
p.rubric {
|
||||
margin-top: 30px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* Sidebars */
|
||||
|
||||
div.sidebar {
|
||||
margin: 0 0 0.5em 1em;
|
||||
border: 1px solid #ddb;
|
||||
padding: 7px 7px 0 7px;
|
||||
background-color: #ffe;
|
||||
width: 40%;
|
||||
float: right;
|
||||
}
|
||||
|
||||
p.sidebar-title {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* "Topics" */
|
||||
|
||||
div.topic {
|
||||
background-color: #eee;
|
||||
border: 1px solid #ccc;
|
||||
padding: 7px 7px 0 7px;
|
||||
margin: 10px 0 10px 0;
|
||||
}
|
||||
|
||||
p.topic-title {
|
||||
font-size: 1.1em;
|
||||
font-weight: bold;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
/* Admonitions */
|
||||
|
||||
div.admonition {
|
||||
margin-top: 10px;
|
||||
margin-bottom: 10px;
|
||||
padding: 7px;
|
||||
}
|
||||
|
||||
div.admonition dt {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
div.admonition dl {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
div.admonition p.admonition-title + p {
|
||||
display: inline;
|
||||
}
|
||||
|
||||
div.seealso {
|
||||
background-color: #ffc;
|
||||
border: 1px solid #ff6;
|
||||
}
|
||||
|
||||
div.warning {
|
||||
background-color: #ffe4e4;
|
||||
border: 1px solid #f66;
|
||||
}
|
||||
|
||||
div.note {
|
||||
background-color: #eee;
|
||||
border: 1px solid #ccc;
|
||||
}
|
||||
|
||||
p.admonition-title {
|
||||
margin: 0px 10px 5px 0px;
|
||||
font-weight: bold;
|
||||
display: inline;
|
||||
}
|
||||
|
||||
p.admonition-title:after {
|
||||
content: ":";
|
||||
}
|
||||
|
||||
div.body p.centered {
|
||||
text-align: center;
|
||||
margin-top: 25px;
|
||||
}
|
||||
|
||||
table.docutils {
|
||||
border: 0;
|
||||
}
|
||||
|
||||
table.docutils td, table.docutils th {
|
||||
padding: 1px 8px 1px 0;
|
||||
border-top: 0;
|
||||
border-left: 0;
|
||||
border-right: 0;
|
||||
border-bottom: 1px solid #aaa;
|
||||
}
|
||||
|
||||
table.field-list td, table.field-list th {
|
||||
border: 0 !important;
|
||||
}
|
||||
|
||||
table.footnote td, table.footnote th {
|
||||
border: 0 !important;
|
||||
}
|
||||
|
||||
.field-list ul {
|
||||
margin: 0;
|
||||
padding-left: 1em;
|
||||
}
|
||||
|
||||
.field-list p {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
dl {
|
||||
margin-bottom: 15px;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
dd p {
|
||||
margin-top: 0px;
|
||||
}
|
||||
|
||||
dd ul, dd table {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
dd {
|
||||
margin-top: 3px;
|
||||
margin-bottom: 10px;
|
||||
margin-left: 30px;
|
||||
}
|
||||
|
||||
.refcount {
|
||||
color: #060;
|
||||
}
|
||||
|
||||
dt:target,
|
||||
.highlight {
|
||||
background-color: #fbe54e;
|
||||
}
|
||||
|
||||
dl.glossary dt {
|
||||
font-weight: bold;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
|
||||
th {
|
||||
text-align: left;
|
||||
padding-right: 5px;
|
||||
}
|
||||
|
||||
pre {
|
||||
padding: 5px;
|
||||
background-color: #efc;
|
||||
color: #333;
|
||||
border: 1px solid #ac9;
|
||||
border-left: none;
|
||||
border-right: none;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
td.linenos pre {
|
||||
padding: 5px 0px;
|
||||
border: 0;
|
||||
background-color: transparent;
|
||||
color: #aaa;
|
||||
}
|
||||
|
||||
table.highlighttable {
|
||||
margin-left: 0.5em;
|
||||
}
|
||||
|
||||
table.highlighttable td {
|
||||
padding: 0 0.5em 0 0.5em;
|
||||
}
|
||||
|
||||
tt {
|
||||
background-color: #ecf0f3;
|
||||
padding: 0 1px 0 1px;
|
||||
font-size: 0.95em;
|
||||
}
|
||||
|
||||
tt.descname {
|
||||
background-color: transparent;
|
||||
font-weight: bold;
|
||||
font-size: 1.2em;
|
||||
}
|
||||
|
||||
tt.descclassname {
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
tt.xref, a tt {
|
||||
background-color: transparent;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.footnote:target { background-color: #ffa }
|
||||
|
||||
h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt {
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
.optional {
|
||||
font-size: 1.3em;
|
||||
}
|
||||
|
||||
.versionmodified {
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
form.comment {
|
||||
margin: 0;
|
||||
padding: 10px 30px 10px 30px;
|
||||
background-color: #eee;
|
||||
}
|
||||
|
||||
form.comment h3 {
|
||||
background-color: #326591;
|
||||
color: white;
|
||||
margin: -10px -30px 10px -30px;
|
||||
padding: 5px;
|
||||
font-size: 1.4em;
|
||||
}
|
||||
|
||||
form.comment input,
|
||||
form.comment textarea {
|
||||
border: 1px solid #ccc;
|
||||
padding: 2px;
|
||||
font-family: sans-serif;
|
||||
font-size: 100%;
|
||||
}
|
||||
|
||||
form.comment input[type="text"] {
|
||||
width: 240px;
|
||||
}
|
||||
|
||||
form.comment textarea {
|
||||
width: 100%;
|
||||
height: 200px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.system-message {
|
||||
background-color: #fda;
|
||||
padding: 5px;
|
||||
border: 3px solid red;
|
||||
}
|
||||
|
||||
img.math {
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
div.math p {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
span.eqno {
|
||||
float: right;
|
||||
}
|
||||
|
||||
img.logo {
|
||||
border: 0;
|
||||
}
|
||||
|
||||
/* :::: PRINT :::: */
|
||||
@media print {
|
||||
div.document,
|
||||
div.documentwrapper,
|
||||
div.bodywrapper {
|
||||
margin: 0;
|
||||
width : 100%;
|
||||
}
|
||||
|
||||
div.sphinxsidebar,
|
||||
div.related,
|
||||
div.footer,
|
||||
div#comments div.new-comment-box,
|
||||
#top-link {
|
||||
display: none;
|
||||
}
|
||||
}
|
docs/_templates/layout.html (vendored, new file, 16 lines)
@@ -0,0 +1,16 @@
{% extends "!layout.html" %}

{% block footer %}
{{ super() }}
<script type="text/javascript">
  !function(){var analytics=window.analytics=window.analytics||[];if(!analytics.initialize)if(analytics.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{analytics.invoked=!0;analytics.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","page","once","off","on"];analytics.factory=function(t){return function(){var e=Array.prototype.slice.call(arguments);e.unshift(t);analytics.push(e);return analytics}};for(var t=0;t<analytics.methods.length;t++){var e=analytics.methods[t];analytics[e]=analytics.factory(e)}analytics.load=function(t){var e=document.createElement("script");e.type="text/javascript";e.async=!0;e.src=("https:"===document.location.protocol?"https://":"http://")+"cdn.segment.com/analytics.js/v1/"+t+"/analytics.min.js";var n=document.getElementsByTagName("script")[0];n.parentNode.insertBefore(e,n)};analytics.SNIPPET_VERSION="3.1.0";
  analytics.load("8UDQfnf3cyFSTsM4YANnW5sXmgZVILbA");
  analytics.page();
  }}();

  analytics.ready(function () {
    ga('require', 'linker');
    ga('linker:autoLink', ['scrapinghub.com', 'crawlera.com']);
  });
</script>
{% endblock %}
docs/_tests/quotes.html (new file, 281 lines)
@@ -0,0 +1,281 @@
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Quotes to Scrape</title>
|
||||
<link rel="stylesheet" href="/static/bootstrap.min.css">
|
||||
<link rel="stylesheet" href="/static/main.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="row header-box">
|
||||
<div class="col-md-8">
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
|
||||
</h1>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p>
|
||||
|
||||
<a href="/login">Login</a>
|
||||
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-8">
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
|
||||
|
||||
<a class="tag" href="/tag/change/page/1/">change</a>
|
||||
|
||||
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
|
||||
|
||||
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
|
||||
|
||||
<a class="tag" href="/tag/world/page/1/">world</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
|
||||
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
|
||||
<a href="/author/J-K-Rowling">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
|
||||
|
||||
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
|
||||
|
||||
<a class="tag" href="/tag/choices/page/1/">choices</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/live/page/1/">live</a>
|
||||
|
||||
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
|
||||
|
||||
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
|
||||
<span>by <small class="author" itemprop="author">Jane Austen</small>
|
||||
<a href="/author/Jane-Austen">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
|
||||
|
||||
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
|
||||
|
||||
<a class="tag" href="/tag/books/page/1/">books</a>
|
||||
|
||||
<a class="tag" href="/tag/classic/page/1/">classic</a>
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>
|
||||
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
|
||||
<a href="/author/Marilyn-Monroe">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
|
||||
|
||||
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
|
||||
|
||||
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
|
||||
|
||||
<a class="tag" href="/tag/success/page/1/">success</a>
|
||||
|
||||
<a class="tag" href="/tag/value/page/1/">value</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
|
||||
<span>by <small class="author" itemprop="author">André Gide</small>
|
||||
<a href="/author/Andre-Gide">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="life,love" / >
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/love/page/1/">love</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“I have not failed. I've just found 10,000 ways that won't work.”</span>
|
||||
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
|
||||
<a href="/author/Thomas-A-Edison">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
|
||||
|
||||
<a class="tag" href="/tag/edison/page/1/">edison</a>
|
||||
|
||||
<a class="tag" href="/tag/failure/page/1/">failure</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it's in hot water.”</span>
|
||||
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
|
||||
<a href="/author/Eleanor-Roosevelt">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
|
||||
|
||||
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
|
||||
<span>by <small class="author" itemprop="author">Steve Martin</small>
|
||||
<a href="/author/Steve-Martin">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
|
||||
|
||||
<a class="tag" href="/tag/simile/page/1/">simile</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<nav>
|
||||
<ul class="pager">
|
||||
|
||||
|
||||
<li class="next">
|
||||
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
<div class="col-md-4 tags-box">
|
||||
|
||||
<h2>Top Ten tags</h2>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
|
||||
</span>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<footer class="footer">
|
||||
<div class="container">
|
||||
<p class="text-muted">
|
||||
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
|
||||
</p>
|
||||
<p class="copyright">
|
||||
Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
|
||||
</p>
|
||||
</div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
281
docs/_tests/quotes1.html
Normal file
@ -0,0 +1,281 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Quotes to Scrape</title>
|
||||
<link rel="stylesheet" href="/static/bootstrap.min.css">
|
||||
<link rel="stylesheet" href="/static/main.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="row header-box">
|
||||
<div class="col-md-8">
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
|
||||
</h1>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p>
|
||||
|
||||
<a href="/login">Login</a>
|
||||
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-8">
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
|
||||
|
||||
<a class="tag" href="/tag/change/page/1/">change</a>
|
||||
|
||||
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
|
||||
|
||||
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
|
||||
|
||||
<a class="tag" href="/tag/world/page/1/">world</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
|
||||
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
|
||||
<a href="/author/J-K-Rowling">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
|
||||
|
||||
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
|
||||
|
||||
<a class="tag" href="/tag/choices/page/1/">choices</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/live/page/1/">live</a>
|
||||
|
||||
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
|
||||
|
||||
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
|
||||
<span>by <small class="author" itemprop="author">Jane Austen</small>
|
||||
<a href="/author/Jane-Austen">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
|
||||
|
||||
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
|
||||
|
||||
<a class="tag" href="/tag/books/page/1/">books</a>
|
||||
|
||||
<a class="tag" href="/tag/classic/page/1/">classic</a>
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>
|
||||
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
|
||||
<a href="/author/Marilyn-Monroe">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
|
||||
|
||||
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
|
||||
|
||||
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
|
||||
|
||||
<a class="tag" href="/tag/success/page/1/">success</a>
|
||||
|
||||
<a class="tag" href="/tag/value/page/1/">value</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
|
||||
<span>by <small class="author" itemprop="author">André Gide</small>
|
||||
<a href="/author/Andre-Gide">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="life,love" / >
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/love/page/1/">love</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“I have not failed. I've just found 10,000 ways that won't work.”</span>
|
||||
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
|
||||
<a href="/author/Thomas-A-Edison">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
|
||||
|
||||
<a class="tag" href="/tag/edison/page/1/">edison</a>
|
||||
|
||||
<a class="tag" href="/tag/failure/page/1/">failure</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it's in hot water.”</span>
|
||||
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
|
||||
<a href="/author/Eleanor-Roosevelt">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
|
||||
|
||||
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
|
||||
<span>by <small class="author" itemprop="author">Steve Martin</small>
|
||||
<a href="/author/Steve-Martin">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
|
||||
|
||||
<a class="tag" href="/tag/simile/page/1/">simile</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<nav>
|
||||
<ul class="pager">
|
||||
|
||||
|
||||
<li class="next">
|
||||
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
<div class="col-md-4 tags-box">
|
||||
|
||||
<h2>Top Ten tags</h2>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
|
||||
</span>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<footer class="footer">
|
||||
<div class="container">
|
||||
<p class="text-muted">
|
||||
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
|
||||
</p>
|
||||
<p class="copyright">
|
||||
Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
|
||||
</p>
|
||||
</div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
141
docs/conf.py
@ -1,5 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Scrapy documentation build configuration file, created by
|
||||
# sphinx-quickstart on Mon Nov 24 12:02:52 2008.
|
||||
#
|
||||
@ -12,13 +10,14 @@
|
||||
# serve to show the default.
|
||||
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from os import path
|
||||
|
||||
# If your extensions are in another directory, add it here. If the directory
|
||||
# is relative to the documentation root, use os.path.abspath to make it
|
||||
# absolute, like shown here.
|
||||
sys.path.append(path.join(path.dirname(__file__), "_ext"))
|
||||
sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))
|
||||
sys.path.insert(0, path.dirname(path.dirname(__file__)))
|
||||
|
||||
|
||||
# General configuration
|
||||
@ -26,7 +25,15 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy"))
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be extensions
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = ['scrapydocs']
|
||||
extensions = [
|
||||
'hoverxref.extension',
|
||||
'notfound.extension',
|
||||
'scrapydocs',
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.coverage',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.viewcode',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
@ -41,8 +48,8 @@ source_suffix = '.rst'
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'Scrapy'
|
||||
copyright = u'2008-2013, Scrapy developers'
|
||||
project = 'Scrapy'
|
||||
copyright = '2008–{}, Scrapy developers'.format(datetime.now().year)
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
@ -70,6 +77,8 @@ language = 'en'
|
||||
# List of documents that shouldn't be included in the build.
|
||||
#unused_docs = []
|
||||
|
||||
exclude_patterns = ['build']
|
||||
|
||||
# List of directories, relative to source directory, that shouldn't be searched
|
||||
# for source files.
|
||||
exclude_trees = ['.build']
|
||||
@ -91,14 +100,33 @@ exclude_trees = ['.build']
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
|
||||
# List of Sphinx warnings that will not be raised
|
||||
suppress_warnings = ['epub.unknown_project_files']
|
||||
|
||||
|
||||
# Options for HTML output
|
||||
# -----------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
#html_theme_options = {}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
# Add path to the RTD explicitly to robustify builds (otherwise might
|
||||
# fail in a clean Debian build env)
|
||||
import sphinx_rtd_theme
|
||||
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||
|
||||
|
||||
# The style sheet to use for HTML and HTML Help pages. A file of that name
|
||||
# must exist either in Sphinx' static/ path, or in one of the custom paths
|
||||
# given in html_static_path.
|
||||
html_style = 'scrapydoc.css'
|
||||
# html_style = 'scrapydoc.css'
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
@ -125,10 +153,6 @@ html_static_path = ['_static']
|
||||
# using the given strftime format.
|
||||
html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
#html_sidebars = {}
|
||||
|
||||
@ -172,8 +196,8 @@ htmlhelp_basename = 'Scrapydoc'
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title, author, document class [howto/manual]).
|
||||
latex_documents = [
|
||||
('index', 'Scrapy.tex', ur'Scrapy Documentation',
|
||||
ur'Scrapy developers', 'manual'),
|
||||
('index', 'Scrapy.tex', 'Scrapy Documentation',
|
||||
'Scrapy developers', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
@ -203,3 +227,94 @@ linkcheck_ignore = [
|
||||
'http://localhost:\d+', 'http://hg.scrapy.org',
|
||||
'http://directory.google.com/'
|
||||
]
|
||||
|
||||
|
||||
# Options for the Coverage extension
|
||||
# ----------------------------------
|
||||
coverage_ignore_pyobjects = [
|
||||
# Contract’s add_pre_hook and add_post_hook are not documented because
|
||||
# they should be transparent to contract developers, for whom pre_hook and
|
||||
# post_hook should be the actual concern.
|
||||
r'\bContract\.add_(pre|post)_hook$',
|
||||
|
||||
# ContractsManager is an internal class, developers are not expected to
|
||||
# interact with it directly in any way.
|
||||
r'\bContractsManager\b$',
|
||||
|
||||
# For default contracts we only want to document their general purpose in
|
||||
# their __init__ method, the methods they reimplement to achieve that purpose
|
||||
# should be irrelevant to developers using those contracts.
|
||||
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
|
||||
|
||||
# Methods of downloader middlewares are not documented, only the classes
|
||||
# themselves, since downloader middlewares are controlled through Scrapy
|
||||
# settings.
|
||||
r'^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.',
|
||||
|
||||
# Base classes of downloader middlewares are implementation details that
|
||||
# are not meant for users.
|
||||
r'^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware',
|
||||
|
||||
# Private exception used by the command-line interface implementation.
|
||||
r'^scrapy\.exceptions\.UsageError',
|
||||
|
||||
# Methods of BaseItemExporter subclasses are only documented in
|
||||
# BaseItemExporter.
|
||||
r'^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.',
|
||||
|
||||
# Extension behavior is only modified through settings. Methods of
|
||||
# extension classes, as well as helper functions, are implementation
|
||||
# details that are not documented.
|
||||
r'^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.', # methods
|
||||
r'^scrapy\.extensions\.[a-z]\w*?\.[a-z]', # helper functions
|
||||
|
||||
# Never documented before, and deprecated now.
|
||||
r'^scrapy\.item\.DictItem$',
|
||||
r'^scrapy\.linkextractors\.FilteringLinkExtractor$',
|
||||
|
||||
# Implementation detail of LxmlLinkExtractor
|
||||
r'^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor',
|
||||
]
|
||||
|
||||
|
||||
# Options for the InterSphinx extension
|
||||
# -------------------------------------
|
||||
|
||||
intersphinx_mapping = {
|
||||
'attrs': ('https://www.attrs.org/en/stable/', None),
|
||||
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
|
||||
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
|
||||
'itemloaders': ('https://itemloaders.readthedocs.io/en/latest/', None),
|
||||
'pytest': ('https://docs.pytest.org/en/latest', None),
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'sphinx': ('https://www.sphinx-doc.org/en/master', None),
|
||||
'tox': ('https://tox.readthedocs.io/en/latest', None),
|
||||
'twisted': ('https://twistedmatrix.com/documents/current', None),
|
||||
'twistedapi': ('https://twistedmatrix.com/documents/current/api', None),
|
||||
}
|
||||
|
||||
|
||||
# Options for sphinx-hoverxref options
|
||||
# ------------------------------------
|
||||
|
||||
hoverxref_auto_ref = True
|
||||
hoverxref_role_types = {
|
||||
"class": "tooltip",
|
||||
"confval": "tooltip",
|
||||
"hoverxref": "tooltip",
|
||||
"mod": "tooltip",
|
||||
"ref": "tooltip",
|
||||
}
|
||||
hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
|
||||
|
||||
|
||||
def setup(app):
|
||||
app.connect('autodoc-skip-member', maybe_skip_member)
|
||||
|
||||
|
||||
def maybe_skip_member(app, what, name, obj, skip, options):
|
||||
if not skip:
|
||||
# autodocs was generating a text "alias of" for the following members
|
||||
# https://github.com/sphinx-doc/sphinx/issues/4422
|
||||
return name in {'default_item_class', 'default_selector_class'}
|
||||
return skip
|
||||
|
29
docs/conftest.py
Normal file
@ -0,0 +1,29 @@
|
||||
import os
|
||||
from doctest import ELLIPSIS, NORMALIZE_WHITESPACE
|
||||
|
||||
from scrapy.http.response.html import HtmlResponse
|
||||
from sybil import Sybil
|
||||
from sybil.parsers.codeblock import CodeBlockParser
|
||||
from sybil.parsers.doctest import DocTestParser
|
||||
from sybil.parsers.skip import skip
|
||||
|
||||
|
||||
def load_response(url, filename):
|
||||
input_path = os.path.join(os.path.dirname(__file__), '_tests', filename)
|
||||
with open(input_path, 'rb') as input_file:
|
||||
return HtmlResponse(url, body=input_file.read())
|
||||
|
||||
|
||||
def setup(namespace):
|
||||
namespace['load_response'] = load_response
|
||||
|
||||
|
||||
pytest_collect_file = Sybil(
|
||||
parsers=[
|
||||
DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
|
||||
CodeBlockParser(future_imports=['print_function']),
|
||||
skip,
|
||||
],
|
||||
pattern='*.rst',
|
||||
setup=setup,
|
||||
).pytest()
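For reference, a documentation code sample can then call the ``load_response`` helper that this ``conftest.py`` injects into the test namespace. A minimal sketch (the URL is illustrative; ``quotes1.html`` is one of the fixtures added under ``docs/_tests/`` in this change)::

    response = load_response('http://quotes.toscrape.com/page/1/', 'quotes1.html')
    # The helper returns a regular HtmlResponse, so selectors work as usual.
    print(response.css('span.text::text').get())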
|
@ -4,22 +4,31 @@
|
||||
Contributing to Scrapy
|
||||
======================
|
||||
|
||||
.. important::
|
||||
|
||||
Double check that you are reading the most recent version of this document at
|
||||
https://docs.scrapy.org/en/master/contributing.html
|
||||
|
||||
There are many ways to contribute to Scrapy. Here are some of them:
|
||||
|
||||
* Blog about Scrapy. Tell the world how you're using Scrapy. This will help
|
||||
newcomers with more examples and the Scrapy project to increase its
|
||||
newcomers with more examples and will help the Scrapy project to increase its
|
||||
visibility.
|
||||
|
||||
* Report bugs and request features in the `issue tracker`_, trying to follow
|
||||
the guidelines detailed in `Reporting bugs`_ below.
|
||||
|
||||
* Submit patches for new functionality and/or bug fixes. Please read
|
||||
`Writing patches`_ and `Submitting patches`_ below for details on how to
|
||||
* Submit patches for new functionalities and/or bug fixes. Please read
|
||||
:ref:`writing-patches` and `Submitting patches`_ below for details on how to
|
||||
write and submit a patch.
|
||||
|
||||
* Join the `scrapy-users`_ mailing list and share your ideas on how to
|
||||
* Join the `Scrapy subreddit`_ and share your ideas on how to
|
||||
improve Scrapy. We're always open to suggestions.
|
||||
|
||||
* Answer Scrapy questions at
|
||||
`Stack Overflow <https://stackoverflow.com/questions/tagged/scrapy>`__.
|
||||
|
||||
|
||||
Reporting bugs
|
||||
==============
|
||||
|
||||
@ -30,33 +39,48 @@ Reporting bugs
|
||||
trusted Scrapy developers, and its archives are not public.
|
||||
|
||||
Well-written bug reports are very helpful, so keep in mind the following
|
||||
guidelines when reporting a new bug.
|
||||
guidelines when you're going to report a new bug.
|
||||
|
||||
* check the :ref:`FAQ <faq>` first to see if your issue is addressed in a
|
||||
well-known question
|
||||
|
||||
* check the `open issues`_ to see if it has already been reported. If it has,
|
||||
don't dismiss the report but check the ticket history and comments, you may
|
||||
find additional useful information to contribute.
|
||||
* if you have a general question about Scrapy usage, please ask it at
|
||||
`Stack Overflow <https://stackoverflow.com/questions/tagged/scrapy>`__
|
||||
(use "scrapy" tag).
|
||||
|
||||
* search the `scrapy-users`_ list to see if it has been discussed there, or
|
||||
if you're not sure if what you're seeing is a bug. You can also ask in the
|
||||
`#scrapy` IRC channel.
|
||||
* check the `open issues`_ to see if the issue has already been reported. If it
|
||||
has, don't dismiss the report, but check the ticket history and comments. If
|
||||
you have additional useful information, please leave a comment, or consider
|
||||
:ref:`sending a pull request <writing-patches>` with a fix.
|
||||
|
||||
* write complete, reproducible, specific bug reports. The smaller the test
|
||||
* search the `scrapy-users`_ list and `Scrapy subreddit`_ to see if it has
|
||||
been discussed there, or if you're not sure if what you're seeing is a bug.
|
||||
You can also ask in the ``#scrapy`` IRC channel.
|
||||
|
||||
* write **complete, reproducible, specific bug reports**. The smaller the test
|
||||
case, the better. Remember that other developers won't have your project to
|
||||
reproduce the bug, so please include all relevant files required to reproduce
|
||||
it.
|
||||
it. See for example StackOverflow's guide on creating a
|
||||
`Minimal, Complete, and Verifiable example`_ exhibiting the issue.
|
||||
|
||||
* the most awesome way to provide a complete reproducible example is to
|
||||
send a pull request which adds a failing test case to the
|
||||
Scrapy testing suite (see :ref:`submitting-patches`).
|
||||
This is helpful even if you don't have an intention to
|
||||
fix the issue yourself.
|
||||
|
||||
* include the output of ``scrapy version -v`` so developers working on your bug
|
||||
know exactly which version and platform it occurred on, which is often very
|
||||
helpful for reproducing it, or knowing if it was already fixed.
|
||||
|
||||
.. _Minimal, Complete, and Verifiable example: https://stackoverflow.com/help/mcve
|
||||
|
||||
.. _writing-patches:
|
||||
|
||||
Writing patches
|
||||
===============
|
||||
|
||||
The better written a patch is, the higher chance that it'll get accepted and
|
||||
the sooner that will be merged.
|
||||
The better a patch is written, the higher the chances that it'll get accepted and the sooner it will be merged.
|
||||
|
||||
Well-written patches should:
|
||||
|
||||
@ -75,10 +99,26 @@ Well-written patches should:
|
||||
the documentation changes in the same patch. See `Documentation policies`_
|
||||
below.
|
||||
|
||||
* if you're adding a private API, please add a regular expression to the
|
||||
``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
|
||||
private API from documentation coverage checks.
|
||||
|
||||
To see if your private API is skipped properly, generate a documentation
|
||||
coverage report as follows::
|
||||
|
||||
tox -e docs-coverage
|
||||
|
||||
* if you are removing deprecated code, first make sure that at least 1 year
|
||||
(12 months) has passed since the release that introduced the deprecation.
|
||||
See :ref:`deprecation-policy`.
|
||||
|
||||
|
||||
.. _submitting-patches:
|
||||
|
||||
Submitting patches
|
||||
==================
|
||||
|
||||
The best way to submit a patch is to issue a `pull request`_ on Github,
|
||||
The best way to submit a patch is to issue a `pull request`_ on GitHub,
|
||||
optionally creating a new issue first.
|
||||
|
||||
Remember to explain what was fixed or the new functionality (what it is, why
|
||||
@ -88,15 +128,41 @@ developers to understand and accept your patch.
|
||||
You can also discuss the new functionality (or bug fix) before creating the
|
||||
patch, but it's always good to have a patch ready to illustrate your arguments
|
||||
and show that you have put some additional thought into the subject. A good
|
||||
starting point is to send a pull request on Github. It can be simple enough to
|
||||
starting point is to send a pull request on GitHub. It can be simple enough to
|
||||
illustrate your idea, and leave documentation/tests for later, after the idea
|
||||
has been validated and proven useful. Alternatively, you can send an email to
|
||||
`scrapy-users`_ to discuss your idea first.
|
||||
has been validated and proven useful. Alternatively, you can start a
|
||||
conversation in the `Scrapy subreddit`_ to discuss your idea first.
|
||||
|
||||
Sometimes there is an existing pull request for the problem you'd like to
|
||||
solve, which is stalled for some reason. Often the pull request is in the
|
||||
right direction, but changes are requested by Scrapy maintainers, and the
|
||||
original pull request author hasn't had time to address them.
|
||||
In this case consider picking up this pull request: open
|
||||
a new pull request with all commits from the original pull request, as well as
|
||||
additional changes to address the raised issues. Doing so helps a lot; it is
|
||||
not considered rude as long as the original author is acknowledged by keeping
|
||||
his/her commits.
|
||||
|
||||
You can pull an existing pull request to a local branch
|
||||
by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE``
|
||||
(replace 'upstream' with a remote name for the Scrapy repository,
|
||||
``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE``
|
||||
with the name of the branch you want to create locally).
|
||||
See also: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally.
|
||||
|
||||
When writing GitHub pull requests, try to keep titles short but descriptive.
|
||||
E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests"
|
||||
prefer "Fix hanging when exception occurs in start_requests (#411)"
|
||||
instead of "Fix for #411". Complete titles make it easy to skim through
|
||||
the issue tracker.
|
||||
|
||||
Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
|
||||
removal, etc) in separate commits than functional changes. This will make pull
|
||||
removal, etc) in separate commits from functional changes. This will make pull
|
||||
requests easier to review and more likely to get merged.
|
||||
|
||||
|
||||
.. _coding-style:
|
||||
|
||||
Coding style
|
||||
============
|
||||
|
||||
@ -105,50 +171,84 @@ Scrapy:
|
||||
|
||||
* Unless otherwise specified, follow :pep:`8`.
|
||||
|
||||
* It's OK to use lines longer than 80 chars if it improves the code
|
||||
* It's OK to use lines longer than 79 chars if it improves the code
|
||||
readability.
|
||||
|
||||
* Don't put your name in the code you contribute. Our policy is to keep
|
||||
the contributor's name in the `AUTHORS`_ file distributed with Scrapy.
|
||||
* Don't put your name in the code you contribute; git provides enough
|
||||
metadata to identify the author of the code.
|
||||
See https://help.github.com/en/github/using-git/setting-your-username-in-git for
|
||||
setup instructions.
|
||||
|
||||
Scrapy Contrib
|
||||
==============
|
||||
|
||||
Scrapy contrib shares a similar rationale as Django contrib, which is explained
|
||||
in `this post <http://jacobian.org/writing/what-is-django-contrib/>`_. If you
|
||||
are working on a new functionality, please follow that rationale to decide
|
||||
whether it should be a Scrapy contrib. If unsure, you can ask in
|
||||
`scrapy-users`_.
|
||||
.. _documentation-policies:
|
||||
|
||||
Documentation policies
|
||||
======================
|
||||
|
||||
* **Don't** use docstrings for documenting classes, or methods which are
|
||||
already documented in the official (sphinx) documentation. For example, the
|
||||
:meth:`ItemLoader.add_value` method should be documented in the sphinx
|
||||
documentation, not its docstring.
|
||||
For reference documentation of API members (classes, methods, etc.) use
|
||||
docstrings and make sure that the Sphinx documentation uses the
|
||||
:mod:`~sphinx.ext.autodoc` extension to pull the docstrings. API reference
|
||||
documentation should follow docstring conventions (`PEP 257`_) and be
|
||||
IDE-friendly: short, to the point, and it may provide short examples.
|
||||
|
||||
* **Do** use docstrings for documenting functions not present in the official
|
||||
(sphinx) documentation, such as functions from ``scrapy.utils`` package and
|
||||
its sub-modules.
|
||||
Other types of documentation, such as tutorials or topics, should be covered in
|
||||
files within the ``docs/`` directory. This includes documentation that is
|
||||
specific to an API member, but goes beyond API reference documentation.
|
||||
|
||||
In any case, if something is covered in a docstring, use the
|
||||
:mod:`~sphinx.ext.autodoc` extension to pull the docstring into the
|
||||
documentation instead of duplicating the docstring in files within the
|
||||
``docs/`` directory.
|
||||
|
||||
Tests
|
||||
=====
|
||||
|
||||
Tests are implemented using the `Twisted unit-testing framework`_, running
|
||||
tests requires `tox`_.
|
||||
Tests are implemented using the :doc:`Twisted unit-testing framework
|
||||
<twisted:core/development/policy/test-standard>`. Running tests requires
|
||||
:doc:`tox <tox:index>`.
|
||||
|
||||
.. _running-tests:
|
||||
|
||||
Running tests
|
||||
-------------
|
||||
|
||||
To run all tests go to the root directory of Scrapy source code and run:
|
||||
To run all tests::
|
||||
|
||||
``tox``
|
||||
tox
|
||||
|
||||
To run a specific test (say ``tests/test_contrib_loader.py``) use:
|
||||
To run a specific test (say ``tests/test_loader.py``) use:
|
||||
|
||||
``tox -- tests/test_contrib_loader.py``
|
||||
``tox -- tests/test_loader.py``
|
||||
|
||||
To run the tests on a specific :doc:`tox <tox:index>` environment, use
|
||||
``-e <name>`` with an environment name from ``tox.ini``. For example, to run
|
||||
the tests with Python 3.6 use::
|
||||
|
||||
tox -e py36
|
||||
|
||||
You can also specify a comma-separated list of environments, and use :ref:`tox’s
|
||||
parallel mode <tox:parallel_mode>` to run the tests on multiple environments in
|
||||
parallel::
|
||||
|
||||
tox -e py36,py38 -p auto
|
||||
|
||||
To pass command-line options to :doc:`pytest <pytest:index>`, add them after
|
||||
``--`` in your call to :doc:`tox <tox:index>`. Using ``--`` overrides the
|
||||
default positional arguments defined in ``tox.ini``, so you must include those
|
||||
default positional arguments (``scrapy tests``) after ``--`` as well::
|
||||
|
||||
tox -- scrapy tests -x # stop after first failure
|
||||
|
||||
You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
|
||||
the Python 3.6 :doc:`tox <tox:index>` environment using all your CPU cores::
|
||||
|
||||
tox -e py36 -- scrapy tests -n auto
|
||||
|
||||
To see the coverage report, install :doc:`coverage <coverage:index>`
|
||||
(``pip install coverage``) and run:
|
||||
|
||||
``coverage report``
|
||||
|
||||
See the output of ``coverage --help`` for more options, such as HTML or XML reports.
|
||||
|
||||
Writing tests
|
||||
-------------
|
||||
@ -161,17 +261,18 @@ Scrapy uses unit-tests, which are located in the `tests/`_ directory.
|
||||
Their module name typically resembles the full path of the module they're
|
||||
testing. For example, the item loaders code is in::
|
||||
|
||||
scrapy.contrib.loader
|
||||
scrapy.loader
|
||||
|
||||
And their unit-tests are in::
|
||||
|
||||
tests/test_contrib_loader.py
|
||||
tests/test_loader.py
|
||||
|
||||
.. _issue tracker: https://github.com/scrapy/scrapy/issues
|
||||
.. _scrapy-users: http://groups.google.com/group/scrapy-users
|
||||
.. _Twisted unit-testing framework: http://twistedmatrix.com/documents/current/core/development/policy/test-standard.html
|
||||
.. _scrapy-users: https://groups.google.com/forum/#!forum/scrapy-users
|
||||
.. _Scrapy subreddit: https://reddit.com/r/scrapy
|
||||
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||
.. _pull request: http://help.github.com/send-pull-requests/
|
||||
.. _tox: https://pypi.python.org/pypi/tox
|
||||
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||
.. _pull request: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request
|
||||
.. _pytest-xdist: https://github.com/pytest-dev/pytest-xdist
|
||||
|
@ -1,34 +0,0 @@
|
||||
.. _experimental:
|
||||
|
||||
Experimental features
|
||||
=====================
|
||||
|
||||
This section documents experimental Scrapy features that may become stable in
|
||||
future releases, but whose API is not yet stable. Use them with caution, and
|
||||
subscribe to the `mailing lists <http://scrapy.org/community/>`_ to get
|
||||
notified of any changes.
|
||||
|
||||
Since it's not revised so frequently, this section may contain documentation
|
||||
which is outdated, incomplete or overlapping with stable documentation (until
|
||||
it's properly merged) . Use at your own risk.
|
||||
|
||||
.. warning::
|
||||
|
||||
This documentation is a work in progress. Use at your own risk.
|
||||
|
||||
Add commands using external libraries
|
||||
-------------------------------------
|
||||
|
||||
You can also add Scrapy commands from an external library by adding `scrapy.commands` section into entry_points in the `setup.py`.
|
||||
|
||||
The following example adds `my_command` command::
|
||||
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(name='scrapy-mymodule',
|
||||
entry_points={
|
||||
'scrapy.commands': [
|
||||
'my_command=my_scrapy_module.commands:MyCommand',
|
||||
],
|
||||
},
|
||||
)
|
190
docs/faq.rst
@ -3,6 +3,8 @@
|
||||
Frequently Asked Questions
|
||||
==========================
|
||||
|
||||
.. _faq-scrapy-bs-cmp:
|
||||
|
||||
How does Scrapy compare to BeautifulSoup or lxml?
|
||||
-------------------------------------------------
|
||||
|
||||
@ -19,33 +21,56 @@ Python code.
|
||||
In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like
|
||||
comparing `jinja2`_ to `Django`_.
|
||||
|
||||
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
|
||||
.. _lxml: http://lxml.de/
|
||||
.. _jinja2: http://jinja.pocoo.org/2/
|
||||
.. _Django: http://www.djangoproject.com
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||
.. _lxml: https://lxml.de/
|
||||
.. _jinja2: https://palletsprojects.com/p/jinja/
|
||||
.. _Django: https://www.djangoproject.com/
|
||||
|
||||
.. _faq-python-versions:
|
||||
Can I use Scrapy with BeautifulSoup?
|
||||
------------------------------------
|
||||
|
||||
What Python versions does Scrapy support?
|
||||
-----------------------------------------
|
||||
Yes, you can.
|
||||
As mentioned :ref:`above <faq-scrapy-bs-cmp>`, `BeautifulSoup`_ can be used
|
||||
for parsing HTML responses in Scrapy callbacks.
|
||||
You just have to feed the response's body into a ``BeautifulSoup`` object
|
||||
and extract whatever data you need from it.
|
||||
|
||||
Scrapy is supported under Python 2.7 only.
|
||||
Python 2.6 support was dropped starting at Scrapy 0.20.
|
||||
Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML parser::
|
||||
|
||||
Does Scrapy work with Python 3?
|
||||
---------------------------------
|
||||
|
||||
No, but there are plans to support Python 3.3+.
|
||||
At the moment, Scrapy works with Python 2.7.
|
||||
from bs4 import BeautifulSoup
|
||||
import scrapy
|
||||
|
||||
|
||||
class ExampleSpider(scrapy.Spider):
|
||||
name = "example"
|
||||
allowed_domains = ["example.com"]
|
||||
start_urls = (
|
||||
'http://www.example.com/',
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
# use lxml to get decent HTML parsing speed
|
||||
soup = BeautifulSoup(response.text, 'lxml')
|
||||
yield {
|
||||
"url": response.url,
|
||||
"title": soup.h1.string
|
||||
}
|
||||
|
||||
.. note::
|
||||
|
||||
``BeautifulSoup`` supports several HTML/XML parsers.
|
||||
See `BeautifulSoup's official documentation`_ on which ones are available.
|
||||
|
||||
.. _BeautifulSoup's official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
|
||||
|
||||
.. seealso:: :ref:`faq-python-versions`.
|
||||
|
||||
Did Scrapy "steal" X from Django?
|
||||
---------------------------------
|
||||
|
||||
Probably, but we don't like that word. We think Django_ is a great open source
|
||||
project and an example to follow, so we've used it as an inspiration for
|
||||
Scrapy.
|
||||
Scrapy.
|
||||
|
||||
We believe that, if something is already done well, there's no need to reinvent
|
||||
it. This concept, besides being one of the foundations for open source and free
|
||||
@ -57,14 +82,12 @@ focus on the real problems we need to solve.
|
||||
We'd be proud if Scrapy serves as an inspiration for other projects. Feel free
|
||||
to steal from us!
|
||||
|
||||
.. _Django: http://www.djangoproject.com
|
||||
|
||||
Does Scrapy work with HTTP proxies?
|
||||
-----------------------------------
|
||||
|
||||
Yes. Support for HTTP proxies is provided (since Scrapy 0.8) through the HTTP
|
||||
Proxy downloader middleware. See
|
||||
:class:`~scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware`.
|
||||
:class:`~scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware`.
|
||||
|
||||
How can I scrape an item with attributes in different pages?
|
||||
------------------------------------------------------------
|
||||
@ -77,25 +100,37 @@ Scrapy crashes with: ImportError: No module named win32api
|
||||
|
||||
You need to install `pywin32`_ because of `this Twisted bug`_.
|
||||
|
||||
.. _pywin32: http://sourceforge.net/projects/pywin32/
|
||||
.. _this Twisted bug: http://twistedmatrix.com/trac/ticket/3707
|
||||
.. _pywin32: https://sourceforge.net/projects/pywin32/
|
||||
.. _this Twisted bug: https://twistedmatrix.com/trac/ticket/3707
|
||||
|
||||
How can I simulate a user login in my spider?
|
||||
---------------------------------------------
|
||||
|
||||
See :ref:`topics-request-response-ref-request-userlogin`.
|
||||
|
||||
.. _faq-bfo-dfo:
|
||||
|
||||
Does Scrapy crawl in breadth-first or depth-first order?
|
||||
--------------------------------------------------------
|
||||
|
||||
By default, Scrapy uses a `LIFO`_ queue for storing pending requests, which
|
||||
basically means that it crawls in `DFO order`_. This order is more convenient
|
||||
in most cases. If you do want to crawl in true `BFO order`_, you can do it by
|
||||
in most cases.
|
||||
|
||||
If you do want to crawl in true `BFO order`_, you can do it by
|
||||
setting the following settings::
|
||||
|
||||
DEPTH_PRIORITY = 1
|
||||
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
|
||||
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
|
||||
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
|
||||
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
|
||||
|
||||
While pending requests are below the configured values of
|
||||
:setting:`CONCURRENT_REQUESTS`, :setting:`CONCURRENT_REQUESTS_PER_DOMAIN` or
|
||||
:setting:`CONCURRENT_REQUESTS_PER_IP`, those requests are sent
|
||||
concurrently. As a result, the first few requests of a crawl rarely follow the
|
||||
desired order. Lowering those settings to ``1`` enforces the desired order, but
|
||||
it significantly slows down the crawl as a whole.
|
||||
|
||||
|
||||
My Scrapy crawler has memory leaks. What can I do?
|
||||
--------------------------------------------------
|
||||
@ -113,7 +148,7 @@ See previous question.
|
||||
Can I use Basic HTTP Authentication in my spiders?
|
||||
--------------------------------------------------
|
||||
|
||||
Yes, see :class:`~scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware`.
|
||||
Yes, see :class:`~scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware`.
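A minimal sketch of a spider that enables it, assuming the ``http_user`` and ``http_pass`` attributes read by ``HttpAuthMiddleware`` (newer Scrapy releases may also expect an ``http_auth_domain`` attribute; the spider name, URL and credentials below are illustrative)::

    import scrapy

    class IntranetSpider(scrapy.Spider):
        name = 'intranet'
        # Credentials picked up by HttpAuthMiddleware for Basic HTTP Authentication.
        http_user = 'someuser'
        http_pass = 'somepass'
        start_urls = ['https://intranet.example.com/']

        def parse(self, response):
            yield {'title': response.css('title::text').get()}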
|
||||
|
||||
Why does Scrapy download pages in English instead of my native language?
|
||||
------------------------------------------------------------------------
|
||||
@ -121,7 +156,7 @@ Why does Scrapy download pages in English instead of my native language?
|
||||
Try changing the default `Accept-Language`_ request header by overriding the
|
||||
:setting:`DEFAULT_REQUEST_HEADERS` setting.
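A minimal ``settings.py`` sketch requesting German pages (the language code is illustrative; the ``Accept`` value mirrors Scrapy's default and is kept so it is not lost when overriding the setting)::

    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'de',
    }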
|
||||
|
||||
.. _Accept-Language: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
|
||||
.. _Accept-Language: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
|
||||
|
||||
Where can I find some example Scrapy projects?
|
||||
----------------------------------------------
|
||||
@ -144,23 +179,23 @@ I get "Filtered offsite request" messages. How can I fix them?
|
||||
Those messages (logged with ``DEBUG`` level) don't necessarily mean there is a
|
||||
problem, so you may not need to fix them.
|
||||
|
||||
Those message are thrown by the Offsite Spider Middleware, which is a spider
|
||||
Those messages are thrown by the Offsite Spider Middleware, which is a spider
|
||||
middleware (enabled by default) whose purpose is to filter out requests to
|
||||
domains outside the ones covered by the spider.
|
||||
|
||||
For more info see:
|
||||
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`.
|
||||
:class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware`.
|
||||
|
||||
What is the recommended way to deploy a Scrapy crawler in production?
|
||||
---------------------------------------------------------------------
|
||||
|
||||
See :ref:`topics-scrapyd`.
|
||||
See :ref:`topics-deploy`.
|
||||
|
||||
Can I use JSON for large exports?
|
||||
---------------------------------
|
||||
|
||||
It'll depend on how large your output is. See :ref:`this warning
|
||||
<json-with-large-data>` in :class:`~scrapy.contrib.exporter.JsonItemExporter`
|
||||
<json-with-large-data>` in :class:`~scrapy.exporters.JsonItemExporter`
|
||||
documentation.
|
||||
|
||||
Can I return (Twisted) deferreds from signal handlers?
|
||||
@ -190,7 +225,7 @@ Or by setting a global download delay in your project with the
|
||||
Can I call ``pdb.set_trace()`` from my spiders to debug them?
|
||||
-------------------------------------------------------------
|
||||
|
||||
Yes, but you can also use the Scrapy shell which allows you too quickly analyze
|
||||
Yes, but you can also use the Scrapy shell which allows you to quickly analyze
|
||||
(and even modify) the response being processed by your spider, which is, quite
|
||||
often, more useful than plain old ``pdb.set_trace()``.
|
||||
|
||||
@ -201,15 +236,15 @@ Simplest way to dump all my scraped items into a JSON/CSV/XML file?
|
||||
|
||||
To dump into a JSON file::
|
||||
|
||||
scrapy crawl myspider -o items.json
|
||||
scrapy crawl myspider -O items.json
|
||||
|
||||
To dump into a CSV file::
|
||||
|
||||
scrapy crawl myspider -o items.csv
|
||||
scrapy crawl myspider -O items.csv
|
||||
|
||||
To dump into a XML file::
|
||||
|
||||
scrapy crawl myspider -o items.xml
|
||||
scrapy crawl myspider -O items.xml
|
||||
|
||||
For more information see :ref:`topics-feed-exports`
|
||||
|
||||
@ -220,8 +255,8 @@ The ``__VIEWSTATE`` parameter is used in sites built with ASP.NET/VB.NET. For
|
||||
more info on how it works see `this page`_. Also, here's an `example spider`_
|
||||
which scrapes one of these sites.
|
||||
|
||||
.. _this page: http://search.cpan.org/~ecarroll/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
|
||||
.. _example spider: http://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py
|
||||
.. _this page: https://metacpan.org/pod/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
|
||||
.. _example spider: https://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py
|
||||
|
||||
What's the best way to parse big XML/CSV data feeds?
|
||||
----------------------------------------------------
|
||||
@ -280,38 +315,63 @@ I'm scraping a XML document and my XPath selector doesn't return any items
|
||||
|
||||
You may need to remove namespaces. See :ref:`removing-namespaces`.
|
||||
|
||||
.. _faq-split-item:
|
||||
|
||||
I'm getting an error: "cannot import name crawler"
|
||||
How to split an item into multiple items in an item pipeline?
|
||||
-------------------------------------------------------------
|
||||
|
||||
:ref:`Item pipelines <topics-item-pipeline>` cannot yield multiple items per
|
||||
input item. :ref:`Create a spider middleware <custom-spider-middleware>`
|
||||
instead, and use its
|
||||
:meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_output`
|
||||
method for this purpose. For example::
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
class MultiplyItemsMiddleware:
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
for item in result:
|
||||
if is_item(item):
|
||||
adapter = ItemAdapter(item)
|
||||
for _ in range(adapter['multiply_by']):
|
||||
yield deepcopy(item)
|
||||
|
||||
Does Scrapy support IPv6 addresses?
|
||||
-----------------------------------
|
||||
|
||||
Yes, by setting :setting:`DNS_RESOLVER` to ``scrapy.resolver.CachingHostnameResolver``.
|
||||
Note that by doing so, you lose the ability to set a specific timeout for DNS requests
|
||||
(the value of the :setting:`DNS_TIMEOUT` setting is ignored).
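In ``settings.py`` this is a one-liner::

    # Enable the IPv6-capable, caching hostname resolver mentioned above.
    DNS_RESOLVER = 'scrapy.resolver.CachingHostnameResolver'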
|
||||
|
||||
|
||||
.. _faq-specific-reactor:
|
||||
|
||||
How to deal with ``<class 'ValueError'>: filedescriptor out of range in select()`` exceptions?
|
||||
----------------------------------------------------------------------------------------------
|
||||
|
||||
This issue `has been reported`_ to appear when running broad crawls in macOS, where the default
|
||||
Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switching to a
|
||||
different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.
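For example, a ``settings.py`` sketch selecting the asyncio-based reactor, one commonly used alternative (any installable Twisted reactor import path should work)::

    # Replace the default select()-based reactor.
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'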
|
||||
|
||||
|
||||
.. _faq-stop-response-download:
|
||||
|
||||
How can I cancel the download of a given response?
|
||||
--------------------------------------------------
|
||||
|
||||
This is caused by Scrapy changes due to the singletons removal. The error is
|
||||
most likely raised by a module (extension, middleware, pipeline or spider) in
|
||||
your Scrapy project that imports ``crawler`` from ``scrapy.project``. For
|
||||
example::
|
||||
In some situations, it might be useful to stop the download of a certain response.
|
||||
For instance, if you only need the first part of a large response and you would like
|
||||
to save resources by avoiding the download of the whole body.
|
||||
In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
|
||||
signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
|
||||
the :ref:`topics-stop-response-download` topic for additional information and examples.
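A minimal sketch of that approach, assuming Scrapy 2.2+ (where ``bytes_received`` and ``StopDownload`` are available); the spider name and URL are illustrative::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class StopEarlySpider(scrapy.Spider):
        name = 'stop_early'
        start_urls = ['https://example.com/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # Stop every download as soon as the first chunk of body bytes arrives.
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # fail=False lets the partially downloaded response reach the callback.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info('Received %d bytes from %s',
                             len(response.body), response.url)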
|
||||
|
||||
from scrapy.project import crawler
|
||||
|
||||
class SomeExtension(object):
|
||||
def __init__(self):
|
||||
self.crawler = crawler
|
||||
# ...
|
||||
|
||||
This way to access the crawler object is deprecated, the code should be ported
|
||||
to use ``from_crawler`` class method, for example::
|
||||
|
||||
class SomeExtension(object):
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls()
|
||||
o.crawler = crawler
|
||||
return o
|
||||
|
||||
Scrapy command line tool has some backwards compatibility in place to support
|
||||
the old import mechanism (with a deprecation warning), but this mechanism may
|
||||
not work if you use Scrapy differently (for example, as a library).
|
||||
|
||||
.. _user agents: http://en.wikipedia.org/wiki/User_agent
|
||||
.. _LIFO: http://en.wikipedia.org/wiki/LIFO
|
||||
.. _DFO order: http://en.wikipedia.org/wiki/Depth-first_search
|
||||
.. _BFO order: http://en.wikipedia.org/wiki/Breadth-first_search
|
||||
.. _has been reported: https://github.com/scrapy/scrapy/issues/2905
|
||||
.. _user agents: https://en.wikipedia.org/wiki/User_agent
|
||||
.. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
|
||||
.. _DFO order: https://en.wikipedia.org/wiki/Depth-first_search
|
||||
.. _BFO order: https://en.wikipedia.org/wiki/Breadth-first_search
|
||||
|
123
docs/index.rst
@ -4,7 +4,13 @@
|
||||
Scrapy |version| documentation
|
||||
==============================
|
||||
|
||||
This documentation contains everything you need to know about Scrapy.
|
||||
Scrapy is a fast high-level `web crawling`_ and `web scraping`_ framework, used
|
||||
to crawl websites and extract structured data from their pages. It can be used
|
||||
for a wide range of purposes, from data mining to monitoring and automated
|
||||
testing.
|
||||
|
||||
.. _web crawling: https://en.wikipedia.org/wiki/Web_crawler
|
||||
.. _web scraping: https://en.wikipedia.org/wiki/Web_scraping
|
||||
|
||||
Getting help
|
||||
============
|
||||
@ -13,13 +19,15 @@ Having trouble? We'd like to help!
|
||||
|
||||
* Try the :doc:`FAQ <faq>` -- it's got answers to some common questions.
|
||||
* Looking for specific information? Try the :ref:`genindex` or :ref:`modindex`.
|
||||
* Search for information in the `archives of the scrapy-users mailing list`_, or
|
||||
`post a question`_.
|
||||
* Ask a question in the `#scrapy IRC channel`_.
|
||||
* Ask or search questions in `StackOverflow using the scrapy tag`_.
|
||||
* Ask or search questions in the `Scrapy subreddit`_.
|
||||
* Search for questions on the archives of the `scrapy-users mailing list`_.
|
||||
* Ask a question in the `#scrapy IRC channel`_.
|
||||
* Report bugs with Scrapy in our `issue tracker`_.
|
||||
|
||||
.. _archives of the scrapy-users mailing list: http://groups.google.com/group/scrapy-users/
|
||||
.. _post a question: http://groups.google.com/group/scrapy-users/
|
||||
.. _scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users
|
||||
.. _Scrapy subreddit: https://www.reddit.com/r/scrapy/
|
||||
.. _StackOverflow using the scrapy tag: https://stackoverflow.com/tags/scrapy
|
||||
.. _#scrapy IRC channel: irc://irc.freenode.net/scrapy
|
||||
.. _issue tracker: https://github.com/scrapy/scrapy/issues
|
||||
|
||||
@ -28,6 +36,7 @@ First steps
|
||||
===========
|
||||
|
||||
.. toctree::
|
||||
:caption: First steps
|
||||
:hidden:
|
||||
|
||||
intro/overview
|
||||
@ -53,24 +62,26 @@ Basic concepts
|
||||
==============
|
||||
|
||||
.. toctree::
|
||||
:caption: Basic concepts
|
||||
:hidden:
|
||||
|
||||
topics/commands
|
||||
topics/items
|
||||
topics/spiders
|
||||
topics/selectors
|
||||
topics/items
|
||||
topics/loaders
|
||||
topics/shell
|
||||
topics/item-pipeline
|
||||
topics/feed-exports
|
||||
topics/request-response
|
||||
topics/link-extractors
|
||||
topics/settings
|
||||
topics/exceptions
|
||||
|
||||
|
||||
:doc:`topics/commands`
|
||||
Learn about the command-line tool used to manage your Scrapy project.
|
||||
|
||||
:doc:`topics/items`
|
||||
Define the data you want to scrape.
|
||||
|
||||
:doc:`topics/spiders`
|
||||
Write the rules to crawl your websites.
|
||||
|
||||
@ -80,6 +91,9 @@ Basic concepts
|
||||
:doc:`topics/shell`
|
||||
Test your extraction code in an interactive environment.
|
||||
|
||||
:doc:`topics/items`
|
||||
Define the data you want to scrape.
|
||||
|
||||
:doc:`topics/loaders`
|
||||
Populate your items with the extracted data.
|
||||
|
||||
@ -89,13 +103,24 @@ Basic concepts
|
||||
:doc:`topics/feed-exports`
|
||||
Output your scraped data using different formats and storages.
|
||||
|
||||
:doc:`topics/request-response`
|
||||
Understand the classes used to represent HTTP requests and responses.
|
||||
|
||||
:doc:`topics/link-extractors`
|
||||
Convenient classes to extract links to follow from pages.
|
||||
|
||||
:doc:`topics/settings`
|
||||
Learn how to configure Scrapy and see all :ref:`available settings <topics-settings-ref>`.
|
||||
|
||||
:doc:`topics/exceptions`
|
||||
See all available exceptions and their meaning.
|
||||
|
||||
|
||||
Built-in services
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:caption: Built-in services
|
||||
:hidden:
|
||||
|
||||
topics/logging
|
||||
@ -105,8 +130,8 @@ Built-in services
|
||||
topics/webservice
|
||||
|
||||
:doc:`topics/logging`
|
||||
Understand the simple logging facility provided by Scrapy.
|
||||
|
||||
Learn how to use Python's built-in logging with Scrapy.
|
||||
|
||||
:doc:`topics/stats`
|
||||
Collect statistics about your scraping crawler.
|
||||
|
||||
@ -124,6 +149,7 @@ Solving specific problems
|
||||
=========================
|
||||
|
||||
.. toctree::
|
||||
:caption: Solving specific problems
|
||||
:hidden:
|
||||
|
||||
faq
|
||||
@ -131,22 +157,22 @@ Solving specific problems
|
||||
topics/contracts
|
||||
topics/practices
|
||||
topics/broad-crawls
|
||||
topics/firefox
|
||||
topics/firebug
|
||||
topics/developer-tools
|
||||
topics/dynamic-content
|
||||
topics/leaks
|
||||
topics/images
|
||||
topics/ubuntu
|
||||
topics/scrapyd
|
||||
topics/media-pipeline
|
||||
topics/deploy
|
||||
topics/autothrottle
|
||||
topics/benchmarking
|
||||
topics/jobs
|
||||
topics/djangoitem
|
||||
topics/coroutines
|
||||
topics/asyncio
|
||||
|
||||
:doc:`faq`
|
||||
Get answers to most frequently asked questions.
|
||||
|
||||
:doc:`topics/debug`
|
||||
Learn how to debug common problems of your scrapy spider.
|
||||
Learn how to debug common problems of your Scrapy spider.
|
||||
|
||||
:doc:`topics/contracts`
|
||||
Learn how to use contracts for testing your spiders.
|
||||
@ -157,23 +183,20 @@ Solving specific problems
|
||||
:doc:`topics/broad-crawls`
|
||||
Tune Scrapy for crawling a lot of domains in parallel.
|
||||
|
||||
:doc:`topics/firefox`
|
||||
Learn how to scrape with Firefox and some useful add-ons.
|
||||
:doc:`topics/developer-tools`
|
||||
Learn how to scrape with your browser's developer tools.
|
||||
|
||||
:doc:`topics/firebug`
|
||||
Learn how to scrape efficiently using Firebug.
|
||||
:doc:`topics/dynamic-content`
|
||||
Read webpage data that is loaded dynamically.
|
||||
|
||||
:doc:`topics/leaks`
|
||||
Learn how to find and get rid of memory leaks in your crawler.
|
||||
|
||||
:doc:`topics/images`
|
||||
Download static images associated with your scraped items.
|
||||
:doc:`topics/media-pipeline`
|
||||
Download files and/or images associated with your scraped items.
|
||||
|
||||
:doc:`topics/ubuntu`
|
||||
Install latest Scrapy packages easily on Ubuntu
|
||||
|
||||
:doc:`topics/scrapyd`
|
||||
Deploying your Scrapy project in production.
|
||||
:doc:`topics/deploy`
|
||||
Deploy your Scrapy spiders and run them on a remote server.
|
||||
|
||||
:doc:`topics/autothrottle`
|
||||
Adjust crawl rate dynamically based on load.
|
||||
@ -184,8 +207,11 @@ Solving specific problems
|
||||
:doc:`topics/jobs`
|
||||
Learn how to pause and resume crawls for large spiders.
|
||||
|
||||
:doc:`topics/djangoitem`
|
||||
Write scraped items using Django models.
|
||||
:doc:`topics/coroutines`
|
||||
Use the :ref:`coroutine syntax <async>`.
|
||||
|
||||
:doc:`topics/asyncio`
|
||||
Use :mod:`asyncio` and :mod:`asyncio`-powered libraries.
|
||||
|
||||
.. _extending-scrapy:
|
||||
|
||||
@ -193,6 +219,7 @@ Extending Scrapy
|
||||
================
|
||||
|
||||
.. toctree::
|
||||
:caption: Extending Scrapy
|
||||
:hidden:
|
||||
|
||||
topics/architecture
|
||||
@ -200,6 +227,9 @@ Extending Scrapy
|
||||
topics/spider-middleware
|
||||
topics/extensions
|
||||
topics/api
|
||||
topics/signals
|
||||
topics/exporters
|
||||
|
||||
|
||||
:doc:`topics/architecture`
|
||||
Understand the Scrapy architecture.
|
||||
@ -216,33 +246,9 @@ Extending Scrapy
|
||||
:doc:`topics/api`
|
||||
Use it on extensions and middlewares to extend Scrapy functionality
|
||||
|
||||
Reference
|
||||
=========
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
|
||||
topics/request-response
|
||||
topics/settings
|
||||
topics/signals
|
||||
topics/exceptions
|
||||
topics/exporters
|
||||
|
||||
:doc:`topics/commands`
|
||||
Learn about the command-line tool and see all :ref:`available commands <topics-commands-ref>`.
|
||||
|
||||
:doc:`topics/request-response`
|
||||
Understand the classes used to represent HTTP requests and responses.
|
||||
|
||||
:doc:`topics/settings`
|
||||
Learn how to configure Scrapy and see all :ref:`available settings <topics-settings-ref>`.
|
||||
|
||||
:doc:`topics/signals`
|
||||
See all available signals and how to work with them.
|
||||
|
||||
:doc:`topics/exceptions`
|
||||
See all available exceptions and their meaning.
|
||||
|
||||
:doc:`topics/exporters`
|
||||
Quickly export your scraped items to a file (XML, CSV, etc).
|
||||
|
||||
@ -251,12 +257,12 @@ All the rest
|
||||
============
|
||||
|
||||
.. toctree::
|
||||
:caption: All the rest
|
||||
:hidden:
|
||||
|
||||
news
|
||||
contributing
|
||||
versioning
|
||||
experimental/index
|
||||
|
||||
:doc:`news`
|
||||
See what has changed in recent Scrapy versions.
|
||||
@ -266,6 +272,3 @@ All the rest
|
||||
|
||||
:doc:`versioning`
|
||||
Understand Scrapy versioning and API stability.
|
||||
|
||||
:doc:`experimental/index`
|
||||
Learn about bleeding-edge features.
|
||||
|
@ -5,21 +5,16 @@ Examples
|
||||
========
|
||||
|
||||
The best way to learn is with examples, and Scrapy is no exception. For this
|
||||
reason, there is an example Scrapy project named dirbot_, that you can use to
|
||||
play and learn more about Scrapy. It contains the dmoz spider described in the
|
||||
tutorial.
|
||||
reason, there is an example Scrapy project named quotesbot_ that you can use to
|
||||
play and learn more about Scrapy. It contains two spiders for
|
||||
http://quotes.toscrape.com, one using CSS selectors and another one using XPath
|
||||
expressions.
|
||||
|
||||
This dirbot_ project is available at: https://github.com/scrapy/dirbot
|
||||
|
||||
It contains a README file with a detailed description of the project contents.
|
||||
The quotesbot_ project is available at: https://github.com/scrapy/quotesbot.
|
||||
You can find more information about it in the project's README.
|
||||
|
||||
If you're familiar with git, you can checkout the code. Otherwise you can
|
||||
download a tarball or zip file of the project by clicking on `Downloads`_.
|
||||
download the project as a zip file by clicking
|
||||
`here <https://github.com/scrapy/quotesbot/archive/master.zip>`_.
|
||||
|
||||
The `scrapy tag on Snipplr`_ is used for sharing code snippets such as spiders,
|
||||
middlewares, extensions, or scripts. Feel free (and encouraged!) to share any
|
||||
code there.
|
||||
|
||||
.. _dirbot: https://github.com/scrapy/dirbot
|
||||
.. _Downloads: https://github.com/scrapy/dirbot/archives/master
|
||||
.. _scrapy tag on Snipplr: http://snipplr.com/all/tags/scrapy/
|
||||
.. _quotesbot: https://github.com/scrapy/quotesbot
|
||||
|
@ -4,90 +4,271 @@
|
||||
Installation guide
|
||||
==================
|
||||
|
||||
.. _faq-python-versions:
|
||||
|
||||
Supported Python versions
|
||||
=========================
|
||||
|
||||
Scrapy requires Python 3.5.2+, either the CPython implementation (default) or
|
||||
the PyPy 5.9+ implementation (see :ref:`python:implementations`).
|
||||
|
||||
|
||||
Installing Scrapy
|
||||
=================
|
||||
|
||||
.. note:: Check :ref:`intro-install-platform-notes` first.
|
||||
If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
|
||||
the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows
|
||||
and macOS.
|
||||
|
||||
The installation steps assume that you have the following things installed:
|
||||
To install Scrapy using ``conda``, run::
|
||||
|
||||
* `Python`_ 2.7
|
||||
conda install -c conda-forge scrapy
|
||||
|
||||
* `pip`_ and `setuptools`_ Python packages. Nowadays `pip`_ requires and
|
||||
installs `setuptools`_ if not installed.
|
||||
Alternatively, if you’re already familiar with installation of Python packages,
|
||||
you can install Scrapy and its dependencies from PyPI with::
|
||||
|
||||
* `lxml`_. Most Linux distributions ship prepackaged versions of lxml.
|
||||
Otherwise refer to http://lxml.de/installation.html
|
||||
pip install Scrapy
|
||||
|
||||
* `OpenSSL`_. This comes preinstalled in all operating systems, except Windows
|
||||
where the Python installer ships it bundled.
|
||||
Note that sometimes this may require solving compilation issues for some Scrapy
|
||||
dependencies depending on your operating system, so be sure to check the
|
||||
:ref:`intro-install-platform-notes`.
|
||||
|
||||
You can install Scrapy using pip (which is the canonical way to install Python
|
||||
packages).
|
||||
We strongly recommend that you install Scrapy in :ref:`a dedicated virtualenv <intro-using-virtualenv>`,
|
||||
to avoid conflicting with your system packages.
|
||||
|
||||
To install using pip::
|
||||
For more detailed and platform-specific instructions, as well as
|
||||
troubleshooting information, read on.
|
||||
|
||||
|
||||
Things that are good to know
|
||||
----------------------------
|
||||
|
||||
Scrapy is written in pure Python and depends on a few key Python packages (among others):
|
||||
|
||||
* `lxml`_, an efficient XML and HTML parser
|
||||
* `parsel`_, an HTML/XML data extraction library written on top of lxml,
|
||||
* `w3lib`_, a multi-purpose helper for dealing with URLs and web page encodings
|
||||
* `twisted`_, an asynchronous networking framework
|
||||
* `cryptography`_ and `pyOpenSSL`_, to deal with various network-level security needs
|
||||
|
||||
The minimal versions which Scrapy is tested against are:
|
||||
|
||||
* Twisted 14.0
|
||||
* lxml 3.4
|
||||
* pyOpenSSL 0.14
|
||||
|
||||
Scrapy may work with older versions of these packages
|
||||
but it is not guaranteed it will continue working
|
||||
because it’s not being tested against them.
|
||||
|
||||
Some of these packages themselves depend on non-Python packages
|
||||
that might require additional installation steps depending on your platform.
|
||||
Please check :ref:`platform-specific guides below <intro-install-platform-notes>`.
|
||||
|
||||
In case of any trouble related to these dependencies,
|
||||
please refer to their respective installation instructions:
|
||||
|
||||
* `lxml installation`_
|
||||
* `cryptography installation`_
|
||||
|
||||
.. _lxml installation: https://lxml.de/installation.html
|
||||
.. _cryptography installation: https://cryptography.io/en/latest/installation/
|
||||
|
||||
|
||||
.. _intro-using-virtualenv:
|
||||
|
||||
Using a virtual environment (recommended)
|
||||
-----------------------------------------
|
||||
|
||||
TL;DR: We recommend installing Scrapy inside a virtual environment
|
||||
on all platforms.
|
||||
|
||||
Python packages can be installed either globally (a.k.a system wide),
|
||||
or in user-space. We do not recommend installing Scrapy system wide.
|
||||
|
||||
Instead, we recommend that you install Scrapy within a so-called
|
||||
"virtual environment" (:mod:`venv`).
|
||||
Virtual environments allow you to not conflict with already-installed Python
|
||||
system packages (which could break some of your system tools and scripts),
|
||||
and still install packages normally with ``pip`` (without ``sudo`` and the likes).
|
||||
|
||||
See :ref:`tut-venv` on how to create your virtual environment.
|
||||
|
||||
Once you have created a virtual environment, you can install Scrapy inside it with ``pip``,
|
||||
just like any other Python package.
|
||||
(See :ref:`platform-specific guides <intro-install-platform-notes>`
|
||||
below for non-Python dependencies that you may need to install beforehand).
|
||||
|
||||
pip install Scrapy
|
||||
|
||||
.. _intro-install-platform-notes:
|
||||
|
||||
Platform specific installation notes
|
||||
====================================
|
||||
|
||||
.. _intro-install-windows:
|
||||
|
||||
Windows
|
||||
-------
|
||||
|
||||
* Install Python 2.7 from http://python.org/download/
|
||||
Though it's possible to install Scrapy on Windows using pip, we recommend you
|
||||
to install `Anaconda`_ or `Miniconda`_ and use the package from the
|
||||
`conda-forge`_ channel, which will avoid most installation issues.
|
||||
|
||||
You need to adjust ``PATH`` environment variable to include paths to
|
||||
the Python executable and additional scripts. The following paths need to be
|
||||
added to ``PATH``::
|
||||
Once you've installed `Anaconda`_ or `Miniconda`_, install Scrapy with::
|
||||
|
||||
C:\Python2.7\;C:\Python2.7\Scripts\;
|
||||
conda install -c conda-forge scrapy
|
||||
|
||||
To update the ``PATH`` open a Command prompt and run::
|
||||
|
||||
c:\python27\python.exe c:\python27\tools\scripts\win_add2path.py
|
||||
.. _intro-install-ubuntu:
|
||||
|
||||
Close the command prompt window and reopen it so changes take effect, run the
|
||||
following command and check it shows the expected Python version::
|
||||
Ubuntu 14.04 or above
|
||||
---------------------
|
||||
|
||||
python --version
|
||||
|
||||
* Install `pip`_ from https://pip.pypa.io/en/latest/installing.html
|
||||
|
||||
Now open a Command prompt to check ``pip`` is installed correctly::
|
||||
|
||||
pip --version
|
||||
|
||||
* At this point Python 2.7 and ``pip`` package manager must be working, let's
|
||||
install Scrapy::
|
||||
|
||||
pip install Scrapy
|
||||
|
||||
Ubuntu 9.10 or above
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
Scrapy is currently tested with recent-enough versions of lxml,
|
||||
twisted and pyOpenSSL, and is compatible with recent Ubuntu distributions.
|
||||
But it should support older versions of Ubuntu too, like Ubuntu 14.04,
|
||||
albeit with potential issues with TLS connections.
|
||||
|
||||
**Don't** use the ``python-scrapy`` package provided by Ubuntu; it is
typically too old and slow to catch up with the latest Scrapy.
|
||||
|
||||
Instead, use the official :ref:`Ubuntu Packages <topics-ubuntu>`, which already
|
||||
solve all dependencies for you and are continuously updated with the latest bug
|
||||
fixes.
|
||||
|
||||
Archlinux
|
||||
~~~~~~~~~
|
||||
To install Scrapy on Ubuntu (or Ubuntu-based) systems, you need to install
|
||||
these dependencies::
|
||||
|
||||
You can follow the generic instructions or install Scrapy from `AUR Scrapy package`::
|
||||
sudo apt-get install python3 python3-dev python3-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
|
||||
|
||||
yaourt -S scrapy
|
||||
- ``python3-dev``, ``zlib1g-dev``, ``libxml2-dev`` and ``libxslt1-dev``
|
||||
are required for ``lxml``
|
||||
- ``libssl-dev`` and ``libffi-dev`` are required for ``cryptography``
|
||||
|
||||
Inside a :ref:`virtualenv <intro-using-virtualenv>`,
|
||||
you can install Scrapy with ``pip`` after that::
|
||||
|
||||
pip install scrapy
|
||||
|
||||
.. note::
|
||||
The same non-Python dependencies can be used to install Scrapy in Debian
|
||||
Jessie (8.0) and above.
|
||||
|
||||
|
||||
.. _Python: http://www.python.org
|
||||
.. _pip: http://www.pip-installer.org/en/latest/installing.html
|
||||
.. _easy_install: http://pypi.python.org/pypi/setuptools
|
||||
.. _Control Panel: http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx
|
||||
.. _lxml: http://lxml.de/
|
||||
.. _OpenSSL: https://pypi.python.org/pypi/pyOpenSSL
|
||||
.. _intro-install-macos:
|
||||
|
||||
macOS
|
||||
-----
|
||||
|
||||
Building Scrapy's dependencies requires the presence of a C compiler and
|
||||
development headers. On macOS this is typically provided by Apple’s Xcode
|
||||
development tools. To install the Xcode command line tools open a terminal
|
||||
window and run::
|
||||
|
||||
xcode-select --install
|
||||
|
||||
There's a `known issue <https://github.com/pypa/pip/issues/2468>`_ that
|
||||
prevents ``pip`` from updating system packages. This has to be addressed to
|
||||
successfully install Scrapy and its dependencies. Here are some proposed
|
||||
solutions:
|
||||
|
||||
* *(Recommended)* **Don't** use system python, install a new, updated version
|
||||
that doesn't conflict with the rest of your system. Here's how to do it using
|
||||
the `homebrew`_ package manager:
|
||||
|
||||
* Install `homebrew`_ following the instructions in https://brew.sh/
|
||||
|
||||
* Update your ``PATH`` variable to state that homebrew packages should be
|
||||
used before system packages (change ``.bashrc`` to ``.zshrc`` accordingly
|
||||
if you're using `zsh`_ as default shell)::
|
||||
|
||||
echo "export PATH=/usr/local/bin:/usr/local/sbin:$PATH" >> ~/.bashrc
|
||||
|
||||
* Reload ``.bashrc`` to ensure the changes have taken place::
|
||||
|
||||
source ~/.bashrc
|
||||
|
||||
* Install python::
|
||||
|
||||
brew install python
|
||||
|
||||
* Latest versions of python have ``pip`` bundled with them so you won't need
|
||||
to install it separately. If this is not the case, upgrade python::
|
||||
|
||||
brew update; brew upgrade python
|
||||
|
||||
* *(Optional)* :ref:`Install Scrapy inside a Python virtual environment
|
||||
<intro-using-virtualenv>`.
|
||||
|
||||
This method is a workaround for the above macOS issue, but it's an overall
|
||||
good practice for managing dependencies and can complement the first method.
|
||||
|
||||
After any of these workarounds you should be able to install Scrapy::
|
||||
|
||||
pip install Scrapy
|
||||
|
||||
|
||||
PyPy
|
||||
----
|
||||
|
||||
We recommend using the latest PyPy version. The version tested is 5.9.0.
|
||||
For PyPy3, only Linux installation was tested.
|
||||
|
||||
Most Scrapy dependencies now have binary wheels for CPython, but not for PyPy.
This means that these dependencies will be built during installation.
On macOS, you are likely to face an issue when building the cryptography
dependency. The solution to this problem is described
`here <https://github.com/pyca/cryptography/issues/2692#issuecomment-272773481>`_:
run ``brew install openssl`` and then export the flags that this command
recommends (only needed when installing Scrapy). Installing on Linux has no special
issues besides installing build dependencies.
|
||||
Installing Scrapy with PyPy on Windows is not tested.
|
||||
|
||||
You can check that Scrapy is installed correctly by running ``scrapy bench``.
|
||||
If this command gives errors such as
|
||||
``TypeError: ... got 2 unexpected keyword arguments``, this means
|
||||
that setuptools was unable to pick up one PyPy-specific dependency.
|
||||
To fix this issue, run ``pip install 'PyPyDispatcher>=2.1.0'``.
|
||||
|
||||
|
||||
.. _intro-install-troubleshooting:
|
||||
|
||||
Troubleshooting
|
||||
===============
|
||||
|
||||
AttributeError: 'module' object has no attribute 'OP_NO_TLSv1_1'
|
||||
----------------------------------------------------------------
|
||||
|
||||
After you install or upgrade Scrapy, Twisted or pyOpenSSL, you may get an
|
||||
exception with the following traceback::
|
||||
|
||||
[…]
|
||||
File "[…]/site-packages/twisted/protocols/tls.py", line 63, in <module>
|
||||
from twisted.internet._sslverify import _setAcceptableProtocols
|
||||
File "[…]/site-packages/twisted/internet/_sslverify.py", line 38, in <module>
|
||||
TLSVersion.TLSv1_1: SSL.OP_NO_TLSv1_1,
|
||||
AttributeError: 'module' object has no attribute 'OP_NO_TLSv1_1'
|
||||
|
||||
The reason you get this exception is that your system or virtual environment
|
||||
has a version of pyOpenSSL that your version of Twisted does not support.
|
||||
|
||||
To install a version of pyOpenSSL that your version of Twisted supports,
|
||||
reinstall Twisted with the :code:`tls` extra option::
|
||||
|
||||
pip install twisted[tls]
|
||||
|
||||
For details, see `Issue #2473 <https://github.com/scrapy/scrapy/issues/2473>`_.
|
||||
|
||||
.. _Python: https://www.python.org/
|
||||
.. _pip: https://pip.pypa.io/en/latest/installing/
|
||||
.. _lxml: https://lxml.de/index.html
|
||||
.. _parsel: https://pypi.org/project/parsel/
|
||||
.. _w3lib: https://pypi.org/project/w3lib/
|
||||
.. _twisted: https://twistedmatrix.com/trac/
|
||||
.. _cryptography: https://cryptography.io/en/latest/
|
||||
.. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/
|
||||
.. _setuptools: https://pypi.python.org/pypi/setuptools
|
||||
.. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/
|
||||
.. _homebrew: https://brew.sh/
|
||||
.. _zsh: https://www.zsh.org/
|
||||
.. _Scrapinghub: https://scrapinghub.com
|
||||
.. _Anaconda: https://docs.anaconda.com/anaconda/
|
||||
.. _Miniconda: https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html
|
||||
.. _conda-forge: https://conda-forge.org/
|
||||
|
@ -8,177 +8,90 @@ Scrapy is an application framework for crawling web sites and extracting
|
||||
structured data which can be used for a wide range of useful applications, like
|
||||
data mining, information processing or historical archival.
|
||||
|
||||
Even though Scrapy was originally designed for `screen scraping`_ (more
|
||||
precisely, `web scraping`_), it can also be used to extract data using APIs
|
||||
(such as `Amazon Associates Web Services`_) or as a general purpose web
|
||||
crawler.
|
||||
Even though Scrapy was originally designed for `web scraping`_, it can also be
|
||||
used to extract data using APIs (such as `Amazon Associates Web Services`_) or
|
||||
as a general purpose web crawler.
|
||||
|
||||
The purpose of this document is to introduce you to the concepts behind Scrapy
|
||||
so you can get an idea of how it works and decide if Scrapy is what you need.
|
||||
|
||||
When you're ready to start a project, you can :ref:`start with the tutorial
|
||||
<intro-tutorial>`.
|
||||
Walk-through of an example spider
|
||||
=================================
|
||||
|
||||
Pick a website
|
||||
==============
|
||||
In order to show you what Scrapy brings to the table, we'll walk you through an
|
||||
example of a Scrapy Spider using the simplest way to run a spider.
|
||||
|
||||
So you need to extract some information from a website, but the website doesn't
|
||||
provide any API or mechanism to access that info programmatically. Scrapy can
|
||||
help you extract that information.
|
||||
|
||||
Let's say we want to extract the URL, name, description and size of all torrent
|
||||
files added today in the `Mininova`_ site.
|
||||
|
||||
The list of all torrents added today can be found on this page:
|
||||
|
||||
http://www.mininova.org/today
|
||||
|
||||
.. _intro-overview-item:
|
||||
|
||||
Define the data you want to scrape
|
||||
==================================
|
||||
|
||||
The first thing is to define the data we want to scrape. In Scrapy, this is
|
||||
done through :ref:`Scrapy Items <topics-items>` (Torrent files, in this case).
|
||||
|
||||
This would be our Item::
|
||||
Here's the code for a spider that scrapes famous quotes from website
|
||||
http://quotes.toscrape.com, following the pagination::
|
||||
|
||||
import scrapy
|
||||
|
||||
class TorrentItem(scrapy.Item):
|
||||
url = scrapy.Field()
|
||||
name = scrapy.Field()
|
||||
description = scrapy.Field()
|
||||
size = scrapy.Field()
|
||||
|
||||
Write a Spider to extract the data
|
||||
==================================
|
||||
class QuotesSpider(scrapy.Spider):
|
||||
name = 'quotes'
|
||||
start_urls = [
|
||||
'http://quotes.toscrape.com/tag/humor/',
|
||||
]
|
||||
|
||||
The next thing is to write a Spider which defines the start URL
|
||||
(http://www.mininova.org/today), the rules for following links and the rules
|
||||
for extracting the data from pages.
|
||||
def parse(self, response):
|
||||
for quote in response.css('div.quote'):
|
||||
yield {
|
||||
'author': quote.xpath('span/small/text()').get(),
|
||||
'text': quote.css('span.text::text').get(),
|
||||
}
|
||||
|
||||
If we take a look at that page content we'll see that all torrent URLs are like
|
||||
``http://www.mininova.org/tor/NUMBER`` where ``NUMBER`` is an integer. We'll use
|
||||
that to construct the regular expression for the links to follow: ``/tor/\d+``.
|
||||
next_page = response.css('li.next a::attr("href")').get()
|
||||
if next_page is not None:
|
||||
yield response.follow(next_page, self.parse)
|
||||
|
||||
We'll use `XPath`_ for selecting the data to extract from the web page HTML
|
||||
source. Let's take one of those torrent pages:
|
||||
Put this in a text file, name it something like ``quotes_spider.py``
|
||||
and run the spider using the :command:`runspider` command::
|
||||
|
||||
http://www.mininova.org/tor/2676093
|
||||
scrapy runspider quotes_spider.py -o quotes.jl
|
||||
|
||||
And look at the page HTML source to construct the XPath to select the data we
|
||||
want which is: torrent name, description and size.
|
||||
When this finishes you will have in the ``quotes.jl`` file a list of the
|
||||
quotes in JSON Lines format, containing text and author, looking like this::
|
||||
|
||||
.. highlight:: html
|
||||
|
||||
By looking at the page HTML source we can see that the file name is contained
|
||||
inside a ``<h1>`` tag::
|
||||
|
||||
<h1>Darwin - The Evolution Of An Exhibition</h1>
|
||||
|
||||
.. highlight:: none
|
||||
|
||||
An XPath expression to extract the name could be::
|
||||
|
||||
//h1/text()
|
||||
|
||||
.. highlight:: html
|
||||
|
||||
And the description is contained inside a ``<div>`` tag with ``id="description"``::
|
||||
|
||||
<h2>Description:</h2>
|
||||
|
||||
<div id="description">
|
||||
Short documentary made for Plymouth City Museum and Art Gallery regarding the setup of an exhibit about Charles Darwin in conjunction with the 200th anniversary of his birth.
|
||||
|
||||
...
|
||||
|
||||
.. highlight:: none
|
||||
|
||||
An XPath expression to select the description could be::
|
||||
|
||||
//div[@id='description']
|
||||
|
||||
.. highlight:: html
|
||||
|
||||
Finally, the file size is contained in the second ``<p>`` tag inside the ``<div>``
|
||||
tag with ``id=specifications``::
|
||||
|
||||
<div id="specifications">
|
||||
|
||||
<p>
|
||||
<strong>Category:</strong>
|
||||
<a href="/cat/4">Movies</a> > <a href="/sub/35">Documentary</a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<strong>Total size:</strong>
|
||||
150.62 megabyte</p>
|
||||
{"author": "Jane Austen", "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"}
|
||||
{"author": "Steve Martin", "text": "\u201cA day without sunshine is like, you know, night.\u201d"}
|
||||
{"author": "Garrison Keillor", "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"}
|
||||
...
|
||||
|
||||
|
||||
.. highlight:: none
|
||||
What just happened?
|
||||
-------------------
|
||||
|
||||
An XPath expression to select the file size could be::
|
||||
When you ran the command ``scrapy runspider quotes_spider.py``, Scrapy looked for a
|
||||
Spider definition inside it and ran it through its crawler engine.
|
||||
|
||||
//div[@id='specifications']/p[2]/text()[2]
|
||||
The crawl started by making requests to the URLs defined in the ``start_urls``
|
||||
attribute (in this case, only the URL for quotes in *humor* category)
|
||||
and called the default callback method ``parse``, passing the response object as
|
||||
an argument. In the ``parse`` callback, we loop through the quote elements
|
||||
using a CSS Selector, yield a Python dict with the extracted quote text and author,
|
||||
look for a link to the next page and schedule another request using the same
|
||||
``parse`` method as callback.
|
||||
|
||||
.. highlight:: python
|
||||
Here you notice one of the main advantages about Scrapy: requests are
|
||||
:ref:`scheduled and processed asynchronously <topics-architecture>`. This
|
||||
means that Scrapy doesn't need to wait for a request to be finished and
|
||||
processed, it can send another request or do other things in the meantime. This
|
||||
also means that other requests can keep going even if some request fails or an
|
||||
error happens while handling it.
|
||||
|
||||
For more information about XPath see the `XPath reference`_.
|
||||
While this enables you to do very fast crawls (sending multiple concurrent
|
||||
requests at the same time, in a fault-tolerant way) Scrapy also gives you
|
||||
control over the politeness of the crawl through :ref:`a few settings
|
||||
<topics-settings-ref>`. You can do things like setting a download delay between
|
||||
each request, limiting amount of concurrent requests per domain or per IP, and
|
||||
even :ref:`using an auto-throttling extension <topics-autothrottle>` that tries
|
||||
to figure out these automatically.
|
||||
|
||||
Finally, here's the spider code::
|
||||
.. note::
|
||||
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
This is using :ref:`feed exports <topics-feed-exports>` to generate the
|
||||
JSON file, you can easily change the export format (XML or CSV, for example) or the
|
||||
storage backend (FTP or `Amazon S3`_, for example). You can also write an
|
||||
:ref:`item pipeline <topics-item-pipeline>` to store the items in a database.
|
||||
|
||||
class MininovaSpider(CrawlSpider):
|
||||
|
||||
name = 'mininova'
|
||||
allowed_domains = ['mininova.org']
|
||||
start_urls = ['http://www.mininova.org/today']
|
||||
rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
|
||||
|
||||
def parse_torrent(self, response):
|
||||
torrent = TorrentItem()
|
||||
torrent['url'] = response.url
|
||||
torrent['name'] = response.xpath("//h1/text()").extract()
|
||||
torrent['description'] = response.xpath("//div[@id='description']").extract()
|
||||
torrent['size'] = response.xpath("//div[@id='info-left']/p[2]/text()[2]").extract()
|
||||
return torrent
|
||||
|
||||
The ``TorrentItem`` class is :ref:`defined above <intro-overview-item>`.
|
||||
|
||||
Run the spider to extract the data
|
||||
==================================
|
||||
|
||||
Finally, we'll run the spider to crawl the site and output the file
|
||||
``scraped_data.json`` with the scraped data in JSON format::
|
||||
|
||||
scrapy crawl mininova -o scraped_data.json
|
||||
|
||||
This uses :ref:`feed exports <topics-feed-exports>` to generate the JSON file.
|
||||
You can easily change the export format (XML or CSV, for example) or the
|
||||
storage backend (FTP or `Amazon S3`_, for example).
|
||||
|
||||
You can also write an :ref:`item pipeline <topics-item-pipeline>` to store the
|
||||
items in a database very easily.
|
||||
|
||||
Review scraped data
|
||||
===================
|
||||
|
||||
If you check the ``scraped_data.json`` file after the process finishes, you'll
|
||||
see the scraped items there::
|
||||
|
||||
[{"url": "http://www.mininova.org/tor/2676093", "name": ["Darwin - The Evolution Of An Exhibition"], "description": ["Short documentary made for Plymouth ..."], "size": ["150.62 megabyte"]},
|
||||
# ... other items ...
|
||||
]
|
||||
|
||||
You'll notice that all field values (except for the ``url`` which was assigned
|
||||
directly) are actually lists. This is because the :ref:`selectors
|
||||
<topics-selectors>` return lists. You may want to store single values, or
|
||||
perform some additional parsing/cleansing to the values. That's what
|
||||
:ref:`Item Loaders <topics-loaders>` are for.
|
||||
|
||||
.. _topics-whatelse:
|
||||
|
||||
@ -190,77 +103,53 @@ this is just the surface. Scrapy provides a lot of powerful features for making
|
||||
scraping easy and efficient, such as:
|
||||
|
||||
* Built-in support for :ref:`selecting and extracting <topics-selectors>` data
|
||||
from HTML and XML sources
|
||||
from HTML/XML sources using extended CSS selectors and XPath expressions,
|
||||
with helper methods to extract using regular expressions.
|
||||
|
||||
* Built-in support for cleaning and sanitizing the scraped data using a
|
||||
collection of reusable filters (called :ref:`Item Loaders <topics-loaders>`)
|
||||
shared between all the spiders.
|
||||
* An :ref:`interactive shell console <topics-shell>` (IPython aware) for trying
|
||||
out the CSS and XPath expressions to scrape data, very useful when writing or
|
||||
debugging your spiders.
|
||||
|
||||
* Built-in support for :ref:`generating feed exports <topics-feed-exports>` in
|
||||
multiple formats (JSON, CSV, XML) and storing them in multiple backends (FTP,
|
||||
S3, local filesystem)
|
||||
|
||||
* A media pipeline for :ref:`automatically downloading images <topics-images>`
|
||||
(or any other media) associated with the scraped items
|
||||
|
||||
* Support for :ref:`extending Scrapy <extending-scrapy>` by plugging
|
||||
your own functionality using :ref:`signals <topics-signals>` and a
|
||||
well-defined API (middlewares, :ref:`extensions <topics-extensions>`, and
|
||||
:ref:`pipelines <topics-item-pipeline>`).
|
||||
|
||||
* Wide range of built-in middlewares and extensions for:
|
||||
|
||||
* cookies and session handling
|
||||
* HTTP compression
|
||||
* HTTP authentication
|
||||
* HTTP cache
|
||||
* user-agent spoofing
|
||||
* robots.txt
|
||||
* crawl depth restriction
|
||||
* and more
|
||||
|
||||
* Robust encoding support and auto-detection, for dealing with foreign,
|
||||
non-standard and broken encoding declarations.
|
||||
|
||||
* Support for creating spiders based on pre-defined templates, to speed up
|
||||
spider creation and make their code more consistent on large projects. See
|
||||
:command:`genspider` command for more details.
|
||||
* :ref:`Strong extensibility support <extending-scrapy>`, allowing you to plug
|
||||
in your own functionality using :ref:`signals <topics-signals>` and a
|
||||
well-defined API (middlewares, :ref:`extensions <topics-extensions>`, and
|
||||
:ref:`pipelines <topics-item-pipeline>`).
|
||||
|
||||
* Extensible :ref:`stats collection <topics-stats>` for multiple spider
|
||||
metrics, useful for monitoring the performance of your spiders and detecting
|
||||
when they get broken
|
||||
* Wide range of built-in extensions and middlewares for handling:
|
||||
|
||||
* An :ref:`Interactive shell console <topics-shell>` for trying XPaths, very
|
||||
useful for writing and debugging your spiders
|
||||
|
||||
* A :ref:`System service <topics-scrapyd>` designed to ease the deployment and
|
||||
run of your spiders in production.
|
||||
- cookies and session handling
|
||||
- HTTP features like compression, authentication, caching
|
||||
- user-agent spoofing
|
||||
- robots.txt
|
||||
- crawl depth restriction
|
||||
- and more
|
||||
|
||||
* A :ref:`Telnet console <topics-telnetconsole>` for hooking into a Python
|
||||
console running inside your Scrapy process, to introspect and debug your
|
||||
crawler
|
||||
|
||||
* :ref:`Logging <topics-logging>` facility that you can hook on to for catching
|
||||
errors during the scraping process.
|
||||
|
||||
* Support for crawling based on URLs discovered through `Sitemaps`_
|
||||
|
||||
* A caching DNS resolver
|
||||
* Plus other goodies like reusable spiders to crawl sites from `Sitemaps`_ and
|
||||
XML/CSV feeds, a media pipeline for :ref:`automatically downloading images
|
||||
<topics-media-pipeline>` (or any other media) associated with the scraped
|
||||
items, a caching DNS resolver, and much more!
|
||||
|
||||
What's next?
|
||||
============
|
||||
|
||||
The next obvious steps are for you to `download Scrapy`_, read :ref:`the
|
||||
tutorial <intro-tutorial>` and join `the community`_. Thanks for your
|
||||
The next steps for you are to :ref:`install Scrapy <intro-install>`,
|
||||
:ref:`follow through the tutorial <intro-tutorial>` to learn how to create
|
||||
a full-blown Scrapy project and `join the community`_. Thanks for your
|
||||
interest!
|
||||
|
||||
.. _download Scrapy: http://scrapy.org/download/
|
||||
.. _the community: http://scrapy.org/community/
|
||||
.. _screen scraping: http://en.wikipedia.org/wiki/Screen_scraping
|
||||
.. _web scraping: http://en.wikipedia.org/wiki/Web_scraping
|
||||
.. _Amazon Associates Web Services: http://aws.amazon.com/associates/
|
||||
.. _Mininova: http://www.mininova.org
|
||||
.. _XPath: http://www.w3.org/TR/xpath
|
||||
.. _XPath reference: http://www.w3.org/TR/xpath
|
||||
.. _Amazon S3: http://aws.amazon.com/s3/
|
||||
.. _Sitemaps: http://www.sitemaps.org
|
||||
.. _join the community: https://scrapy.org/community/
|
||||
.. _web scraping: https://en.wikipedia.org/wiki/Web_scraping
|
||||
.. _Amazon Associates Web Services: https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html
|
||||
.. _Amazon S3: https://aws.amazon.com/s3/
|
||||
.. _Sitemaps: https://www.sitemaps.org/index.html
|
||||
|
3499
docs/news.rst
4
docs/requirements.txt
Normal file
@ -0,0 +1,4 @@
|
||||
Sphinx>=3.0
|
||||
sphinx-hoverxref>=0.2b1
|
||||
sphinx-notfound-page>=0.4
|
||||
sphinx_rtd_theme>=0.4
|
Before Width: | Height: | Size: 43 KiB |
Before Width: | Height: | Size: 68 KiB |
Before Width: | Height: | Size: 88 KiB |
BIN
docs/topics/_images/inspector_01.png
Normal file
After Width: | Height: | Size: 53 KiB |
BIN
docs/topics/_images/network_01.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
docs/topics/_images/network_02.png
Normal file
After Width: | Height: | Size: 81 KiB |
BIN
docs/topics/_images/network_03.png
Normal file
After Width: | Height: | Size: 44 KiB |
BIN
docs/topics/_images/scrapy_architecture_02.png
Normal file
After Width: | Height: | Size: 53 KiB |
@ -28,9 +28,10 @@ contains a dictionary of all available extensions and their order similar to
|
||||
how you :ref:`configure the downloader middlewares
|
||||
<topics-downloader-middleware-setting>`.
|
||||
|
||||
.. class:: Crawler(settings)
|
||||
.. class:: Crawler(spidercls, settings)
|
||||
|
||||
The Crawler object must be instantiated with a
|
||||
:class:`scrapy.spiders.Spider` subclass and a
|
||||
:class:`scrapy.settings.Settings` object.
|
||||
|
||||
.. attribute:: settings
|
||||
@ -64,7 +65,7 @@ how you :ref:`configure the downloader middlewares
|
||||
|
||||
For an introduction on stats collection see :ref:`topics-stats`.
|
||||
|
||||
For the API see :class:`~scrapy.statscol.StatsCollector` class.
|
||||
For the API see :class:`~scrapy.statscollectors.StatsCollector` class.
|
||||
|
||||
.. attribute:: extensions
|
||||
|
||||
@ -75,34 +76,39 @@ how you :ref:`configure the downloader middlewares
|
||||
For an introduction on extensions and a list of available extensions on
|
||||
Scrapy see :ref:`topics-extensions`.
|
||||
|
||||
.. attribute:: spiders
|
||||
|
||||
The spider manager which takes care of loading and instantiating
|
||||
spiders.
|
||||
|
||||
Most extensions won't need to access this attribute.
|
||||
|
||||
.. attribute:: engine
|
||||
|
||||
The execution engine, which coordinates the core crawling logic
|
||||
between the scheduler, downloader and spiders.
|
||||
|
||||
Some extension may want to access the Scrapy engine, to modify inspect
|
||||
or modify the downloader and scheduler behaviour, although this is an
|
||||
Some extensions may want to access the Scrapy engine, to inspect or
|
||||
modify the downloader and scheduler behaviour, although this is an
|
||||
advanced use and this API is not yet stable.
|
||||
|
||||
.. method:: configure()
|
||||
.. attribute:: spider
|
||||
|
||||
Configure the crawler.
|
||||
Spider currently being crawled. This is an instance of the spider class
|
||||
provided while constructing the crawler, and it is created after the
|
||||
arguments given in the :meth:`crawl` method.
|
||||
|
||||
This loads extensions, middlewares and spiders, leaving the crawler
|
||||
ready to be started. It also configures the execution engine.
|
||||
.. method:: crawl(*args, **kwargs)
|
||||
|
||||
.. method:: start()
|
||||
Starts the crawler by instantiating its spider class with the given
|
||||
``args`` and ``kwargs`` arguments, while setting the execution engine in
|
||||
motion.
|
||||
|
||||
Start the crawler. This calls :meth:`configure` if it hasn't been called yet.
|
||||
Returns a deferred that is fired when the crawl is finished.
|
||||
|
||||
.. automethod:: stop
|
||||
|
||||
.. autoclass:: CrawlerRunner
|
||||
:members:
|
||||
|
||||
.. autoclass:: CrawlerProcess
|
||||
:show-inheritance:
|
||||
:members:
|
||||
:inherited-members:
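
A common way to use these classes is to run Scrapy from a script through
:class:`CrawlerProcess`, which creates a :class:`Crawler` for every spider
class passed to its ``crawl`` method. A minimal sketch (the spider, URL and
settings values are illustrative only)::

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class TitleSpider(scrapy.Spider):
        name = "title"
        start_urls = ["http://quotes.toscrape.com/"]

        def parse(self, response):
            yield {"title": response.css("title::text").get()}


    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(TitleSpider)  # schedule a crawl with this spider class
    process.start()             # start the reactor; blocks until crawls finish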
|
||||
|
||||
.. _topics-api-settings:
|
||||
|
||||
Settings API
|
||||
@ -129,194 +135,80 @@ Settings API
|
||||
'default': 0,
|
||||
'command': 10,
|
||||
'project': 20,
|
||||
'spider': 30,
|
||||
'cmdline': 40,
|
||||
}
|
||||
|
||||
For a detailed explanation of each of these settings sources, see:
|
||||
:ref:`topics-settings`.
|
||||
|
||||
.. class:: Settings(values={}, priority='project')
|
||||
.. autofunction:: get_settings_priority
|
||||
|
||||
This object stores Scrapy settings for the configuration of internal
|
||||
components, and can be used for any further customization.
|
||||
.. autoclass:: Settings
|
||||
:show-inheritance:
|
||||
:members:
|
||||
|
||||
After instantiation of this class, the new object will have the global
|
||||
default settings described on :ref:`topics-settings-ref` already
|
||||
populated.
|
||||
.. autoclass:: BaseSettings
|
||||
:members:
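
For illustration, here is a minimal sketch of working with the settings
classes documented above; the setting names are standard Scrapy settings,
while the chosen priorities are just examples::

    from scrapy.settings import Settings

    settings = Settings({"DOWNLOAD_DELAY": "2"}, priority="project")
    settings.set("CONCURRENT_REQUESTS", 32, priority="cmdline")

    settings.getfloat("DOWNLOAD_DELAY")       # 2.0 (string converted to float)
    settings.getint("CONCURRENT_REQUESTS")    # 32
    settings.getbool("AUTOTHROTTLE_ENABLED")  # False (not set, default used)
    settings.getlist("SPIDER_MODULES", [])    # [] (not set, default used)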
|
||||
|
||||
Additional values can be passed on initialization with the ``values``
|
||||
argument, and they would take the ``priority`` level. If the latter
|
||||
argument is a string, the priority name will be looked up in
|
||||
:attr:`~scrapy.settings.SETTINGS_PRIORITIES`. Otherwise, a specific
|
||||
integer should be provided.
|
||||
.. _topics-api-spiderloader:
|
||||
|
||||
Once the object is created, new settings can be loaded or updated with the
|
||||
:meth:`~scrapy.settings.Settings.set` method, and can be accessed with the
|
||||
square bracket notation of dictionaries, or with the
|
||||
:meth:`~scrapy.settings.Settings.get` method of the instance and its value
|
||||
conversion variants. When requesting a stored key, the value with the
|
||||
highest priority will be retrieved.
|
||||
SpiderLoader API
|
||||
================
|
||||
|
||||
.. method:: set(name, value, priority='project')
|
||||
.. module:: scrapy.spiderloader
|
||||
:synopsis: The spider loader
|
||||
|
||||
Store a key/value attribute with a given priority.
|
||||
.. class:: SpiderLoader
|
||||
|
||||
Settings should be populated *before* configuring the Crawler object
|
||||
(through the :meth:`~scrapy.crawler.Crawler.configure` method),
|
||||
otherwise they won't have any effect.
|
||||
This class is in charge of retrieving and handling the spider classes
|
||||
defined across the project.
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
Custom spider loaders can be employed by specifying their path in the
|
||||
:setting:`SPIDER_LOADER_CLASS` project setting. They must fully implement
|
||||
the :class:`scrapy.interfaces.ISpiderLoader` interface to guarantee an
|
||||
errorless execution.
|
||||
|
||||
:param value: the value to associate with the setting
|
||||
:type value: any
|
||||
.. method:: from_settings(settings)
|
||||
|
||||
:param priority: the priority of the setting. Should be a key of
|
||||
:attr:`~scrapy.settings.SETTINGS_PRIORITIES` or an integer
|
||||
:type priority: string or int
|
||||
This class method is used by Scrapy to create an instance of the class.
|
||||
It's called with the current project settings, and it loads the spiders
|
||||
found recursively in the modules of the :setting:`SPIDER_MODULES`
|
||||
setting.
|
||||
|
||||
.. method:: setdict(values, priority='project')
|
||||
:param settings: project settings
|
||||
:type settings: :class:`~scrapy.settings.Settings` instance
|
||||
|
||||
Store key/value pairs with a given priority.
|
||||
.. method:: load(spider_name)
|
||||
|
||||
This is a helper function that calls
|
||||
:meth:`~scrapy.settings.Settings.set` for every item of ``values``
|
||||
with the provided ``priority``.
|
||||
Get the Spider class with the given name. It'll look into the previously
|
||||
loaded spiders for a spider class with name ``spider_name`` and will raise
|
||||
a KeyError if not found.
|
||||
|
||||
:param values: the settings names and values
|
||||
:type values: dict
|
||||
:param spider_name: spider class name
|
||||
:type spider_name: str
|
||||
|
||||
:param priority: the priority of the settings. Should be a key of
|
||||
:attr:`~scrapy.settings.SETTINGS_PRIORITIES` or an integer
|
||||
:type priority: string or int
|
||||
.. method:: list()
|
||||
|
||||
.. method:: setmodule(module, priority='project')
|
||||
Get the names of the available spiders in the project.
|
||||
|
||||
Store settings from a module with a given priority.
|
||||
.. method:: find_by_request(request)
|
||||
|
||||
This is a helper function that calls
|
||||
:meth:`~scrapy.settings.Settings.set` for every globally declared
|
||||
uppercase variable of ``module`` with the provided ``priority``.
|
||||
List the spiders' names that can handle the given request. Will try to
|
||||
match the request's url against the domains of the spiders.
|
||||
|
||||
:param module: the module or the path of the module
|
||||
:type module: module object or string
|
||||
|
||||
:param priority: the priority of the settings. Should be a key of
|
||||
:attr:`~scrapy.settings.SETTINGS_PRIORITIES` or an integer
|
||||
:type priority: string or int
|
||||
|
||||
.. method:: get(name, default=None)
|
||||
|
||||
Get a setting value without affecting its original type.
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
|
||||
:param default: the value to return if no setting is found
|
||||
:type default: any
|
||||
|
||||
.. method:: getbool(name, default=False)
|
||||
|
||||
Get a setting value as a boolean. For example, both ``1`` and ``'1'``, and
|
||||
``True`` return ``True``, while ``0``, ``'0'``, ``False`` and ``None``
|
||||
return ``False``.
|
||||
|
||||
For example, settings populated through environment variables set to ``'0'``
|
||||
will return ``False`` when using this method.
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
|
||||
:param default: the value to return if no setting is found
|
||||
:type default: any
|
||||
|
||||
.. method:: getint(name, default=0)
|
||||
|
||||
Get a setting value as an int
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
|
||||
:param default: the value to return if no setting is found
|
||||
:type default: any
|
||||
|
||||
.. method:: getfloat(name, default=0.0)
|
||||
|
||||
Get a setting value as a float
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
|
||||
:param default: the value to return if no setting is found
|
||||
:type default: any
|
||||
|
||||
.. method:: getlist(name, default=None)
|
||||
|
||||
Get a setting value as a list. If the setting original type is a list it
|
||||
will be returned verbatim. If it's a string it will be split by ",".
|
||||
|
||||
For example, settings populated through environment variables set to
|
||||
``'one,two'`` will return a list ['one', 'two'] when using this method.
|
||||
|
||||
:param name: the setting name
|
||||
:type name: string
|
||||
|
||||
:param default: the value to return if no setting is found
|
||||
:type default: any
|
||||
:param request: queried request
|
||||
:type request: :class:`~scrapy.http.Request` instance
|
||||
|
||||
.. _topics-api-signals:
|
||||
|
||||
Signals API
|
||||
===========
|
||||
|
||||
.. module:: scrapy.signalmanager
|
||||
:synopsis: The signal manager
|
||||
|
||||
.. class:: SignalManager
|
||||
|
||||
.. method:: connect(receiver, signal)
|
||||
|
||||
Connect a receiver function to a signal.
|
||||
|
||||
The signal can be any object, although Scrapy comes with some
|
||||
predefined signals that are documented in the :ref:`topics-signals`
|
||||
section.
|
||||
|
||||
:param receiver: the function to be connected
|
||||
:type receiver: callable
|
||||
|
||||
:param signal: the signal to connect to
|
||||
:type signal: object
|
||||
|
||||
.. method:: send_catch_log(signal, \*\*kwargs)
|
||||
|
||||
Send a signal, catch exceptions and log them.
|
||||
|
||||
The keyword arguments are passed to the signal handlers (connected
|
||||
through the :meth:`connect` method).
|
||||
|
||||
.. method:: send_catch_log_deferred(signal, \*\*kwargs)
|
||||
|
||||
Like :meth:`send_catch_log` but supports returning `deferreds`_ from
|
||||
signal handlers.
|
||||
|
||||
Returns a `deferred`_ that gets fired once all signal handlers
|
||||
deferreds were fired. Send a signal, catch exceptions and log them.
|
||||
|
||||
The keyword arguments are passed to the signal handlers (connected
|
||||
through the :meth:`connect` method).
|
||||
|
||||
.. method:: disconnect(receiver, signal)
|
||||
|
||||
Disconnect a receiver function from a signal. This has the opposite
|
||||
effect of the :meth:`connect` method, and the arguments are the same.
|
||||
|
||||
.. method:: disconnect_all(signal)
|
||||
|
||||
Disconnect all receivers from the given signal.
|
||||
|
||||
:param signal: the signal to disconnect from
|
||||
:type signal: object
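
As a sketch of how these methods are typically used, an extension can connect
a receiver through the crawler's signal manager (``crawler.signals``); the
extension name below is illustrative only::

    from scrapy import signals


    class SpiderOpenedLogger:

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            # crawler.signals is a SignalManager instance.
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            return ext

        def spider_opened(self, spider):
            spider.logger.info("Spider opened: %s", spider.name)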
|
||||
.. automodule:: scrapy.signalmanager
|
||||
:synopsis: The signal manager
|
||||
:members:
|
||||
:undoc-members:
|
||||
|
||||
.. _topics-api-stats:
|
||||
|
||||
@ -324,11 +216,11 @@ Stats Collector API
|
||||
===================
|
||||
|
||||
There are several Stats Collectors available under the
|
||||
:mod:`scrapy.statscol` module and they all implement the Stats
|
||||
Collector API defined by the :class:`~scrapy.statscol.StatsCollector`
|
||||
:mod:`scrapy.statscollectors` module and they all implement the Stats
|
||||
Collector API defined by the :class:`~scrapy.statscollectors.StatsCollector`
|
||||
class (which they all inherit from).
|
||||
|
||||
.. module:: scrapy.statscol
|
||||
.. module:: scrapy.statscollectors
|
||||
:synopsis: Stats Collectors
|
||||
|
||||
.. class:: StatsCollector
|
||||
@ -358,7 +250,7 @@ class (which they all inherit from).
|
||||
|
||||
Set the given value for the given key only if current value for the
|
||||
same key is lower than value. If there is no current value for the
|
||||
given key, the value is always set.
|
||||
given key, the value is always set.
|
||||
|
||||
.. method:: min_value(key, value)
|
||||
|
||||
@ -381,6 +273,3 @@ class (which they all inherit from).
|
||||
|
||||
Close the given spider. After this is called, no more specific stats
|
||||
can be accessed or collected.
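
As an illustration of this API, an item pipeline can record custom statistics
through the crawler's stats collector. This is a minimal sketch; the stat keys
and the ``price`` field are illustrative only and assume dict-like items::

    class StatsRecordingPipeline:

        def __init__(self, stats):
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler.stats)

        def process_item(self, item, spider):
            # Count every item and track the highest price seen so far.
            self.stats.inc_value("custom/items_seen")
            self.stats.max_value("custom/max_price", item.get("price", 0))
            return item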
|
||||
|
||||
.. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
|
||||
.. _deferred: http://twistedmatrix.com/documents/current/core/howto/defer.html
|
||||
|
@ -12,24 +12,77 @@ Overview
|
||||
|
||||
The following diagram shows an overview of the Scrapy architecture with its
|
||||
components and an outline of the data flow that takes place inside the system
|
||||
(shown by the green arrows). A brief description of the components is included
|
||||
(shown by the red arrows). A brief description of the components is included
|
||||
below with links for more detailed information about them. The data flow is
|
||||
also described below.
|
||||
|
||||
.. image:: _images/scrapy_architecture.png
|
||||
.. _data-flow:
|
||||
|
||||
Data flow
|
||||
=========
|
||||
|
||||
.. image:: _images/scrapy_architecture_02.png
|
||||
:width: 700
|
||||
:height: 494
|
||||
:height: 470
|
||||
:alt: Scrapy architecture
|
||||
|
||||
The data flow in Scrapy is controlled by the execution engine, and goes like
|
||||
this:
|
||||
|
||||
1. The :ref:`Engine <component-engine>` gets the initial Requests to crawl from the
|
||||
:ref:`Spider <component-spiders>`.
|
||||
|
||||
2. The :ref:`Engine <component-engine>` schedules the Requests in the
|
||||
:ref:`Scheduler <component-scheduler>` and asks for the
|
||||
next Requests to crawl.
|
||||
|
||||
3. The :ref:`Scheduler <component-scheduler>` returns the next Requests
|
||||
to the :ref:`Engine <component-engine>`.
|
||||
|
||||
4. The :ref:`Engine <component-engine>` sends the Requests to the
|
||||
:ref:`Downloader <component-downloader>`, passing through the
|
||||
:ref:`Downloader Middlewares <component-downloader-middleware>` (see
|
||||
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_request`).
|
||||
|
||||
5. Once the page finishes downloading the
|
||||
:ref:`Downloader <component-downloader>` generates a Response (with
|
||||
that page) and sends it to the Engine, passing through the
|
||||
:ref:`Downloader Middlewares <component-downloader-middleware>` (see
|
||||
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_response`).
|
||||
|
||||
6. The :ref:`Engine <component-engine>` receives the Response from the
|
||||
:ref:`Downloader <component-downloader>` and sends it to the
|
||||
:ref:`Spider <component-spiders>` for processing, passing
|
||||
through the :ref:`Spider Middleware <component-spider-middleware>` (see
|
||||
:meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_input`).
|
||||
|
||||
7. The :ref:`Spider <component-spiders>` processes the Response and returns
|
||||
scraped items and new Requests (to follow) to the
|
||||
:ref:`Engine <component-engine>`, passing through the
|
||||
:ref:`Spider Middleware <component-spider-middleware>` (see
|
||||
:meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_output`).
|
||||
|
||||
8. The :ref:`Engine <component-engine>` sends processed items to
|
||||
:ref:`Item Pipelines <component-pipelines>`, then sends processed Requests to
|
||||
the :ref:`Scheduler <component-scheduler>` and asks for possible next Requests
|
||||
to crawl.
|
||||
|
||||
9. The process repeats (from step 1) until there are no more requests from the
|
||||
:ref:`Scheduler <component-scheduler>`.

Components
==========

.. _component-engine:

Scrapy Engine
-------------

The engine is responsible for controlling the data flow between all components
of the system, and triggering events when certain actions occur. See the Data
Flow section below for more details.
of the system, and triggering events when certain actions occur. See the
:ref:`Data Flow <data-flow>` section above for more details.

.. _component-scheduler:

Scheduler
---------
@ -37,19 +90,24 @@ Scheduler
The Scheduler receives requests from the engine and enqueues them for feeding
them later (also to the engine) when the engine requests them.

.. _component-downloader:

Downloader
----------

The Downloader is responsible for fetching web pages and feeding them to the
engine which, in turn, feeds them to the spiders.

.. _component-spiders:

Spiders
-------

Spiders are custom classes written by Scrapy users to parse responses and
extract items (aka scraped items) from them or additional URLs (requests) to
follow. Each spider is able to handle a specific domain (or group of domains).
For more information see :ref:`topics-spiders`.
extract :ref:`items <topics-items>` from them or additional requests to
follow. For more information see :ref:`topics-spiders`.

.. _component-pipelines:

Item Pipeline
-------------
@ -59,57 +117,44 @@ extracted (or scraped) by the spiders. Typical tasks include cleansing,
validation and persistence (like storing the item in a database). For more
information see :ref:`topics-item-pipeline`.

.. _component-downloader-middleware:

Downloader middlewares
----------------------

Downloader middlewares are specific hooks that sit between the Engine and the
Downloader and process requests when they pass from the Engine to the
Downloader, and responses that pass from Downloader to the Engine. They provide
a convenient mechanism for extending Scrapy functionality by plugging custom
code. For more information see :ref:`topics-downloader-middleware`.
Downloader, and responses that pass from Downloader to the Engine.

Use a Downloader middleware if you need to do one of the following:

* process a request just before it is sent to the Downloader
  (i.e. right before Scrapy sends the request to the website);
* change received response before passing it to a spider;
* send a new Request instead of passing received response to a spider;
* pass response to a spider without fetching a web page;
* silently drop some requests.

For more information see :ref:`topics-downloader-middleware`.
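
A rough sketch of a downloader middleware covering two of the cases listed
above (the class name and the URL rule are invented for this example)::

    from scrapy.exceptions import IgnoreRequest

    class ExampleDownloaderMiddleware:
        def process_request(self, request, spider):
            # Silently drop some requests (hypothetical rule).
            if 'skip-me' in request.url:
                raise IgnoreRequest
            # Returning None lets the request continue to the Downloader.
            return None

        def process_response(self, request, response, spider):
            # Send a new Request instead of passing the received response
            # to the spider, e.g. to retry temporary server errors.
            if response.status == 503:
                return request.replace(dont_filter=True)
            return response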

.. _component-spider-middleware:

Spider middlewares
------------------

Spider middlewares are specific hooks that sit between the Engine and the
Spiders and are able to process spider input (responses) and output (items and
requests). They provide a convenient mechanism for extending Scrapy
functionality by plugging custom code. For more information see
:ref:`topics-spider-middleware`.
requests).

Data flow
=========
Use a Spider middleware if you need to

The data flow in Scrapy is controlled by the execution engine, and goes like
this:
* post-process output of spider callbacks - change/add/remove requests or items;
* post-process start_requests;
* handle spider exceptions;
* call errback instead of callback for some of the requests based on response
  content.

1. The Engine opens a domain, locates the Spider that handles that domain, and
   asks the spider for the first URLs to crawl.

2. The Engine gets the first URLs to crawl from the Spider and schedules them
   in the Scheduler, as Requests.

3. The Engine asks the Scheduler for the next URLs to crawl.

4. The Scheduler returns the next URLs to crawl to the Engine and the Engine
   sends them to the Downloader, passing through the Downloader Middleware
   (request direction).

5. Once the page finishes downloading the Downloader generates a Response (with
   that page) and sends it to the Engine, passing through the Downloader
   Middleware (response direction).

6. The Engine receives the Response from the Downloader and sends it to the
   Spider for processing, passing through the Spider Middleware (input direction).

7. The Spider processes the Response and returns scraped Items and new Requests
   (to follow) to the Engine.

8. The Engine sends scraped Items (returned by the Spider) to the Item Pipeline
   and Requests (returned by spider) to the Scheduler

9. The process repeats (from step 2) until there are no more requests from the
   Scheduler, and the Engine closes the domain.
For more information see :ref:`topics-spider-middleware`.
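
Again purely as a sketch (the class name and the dropped-item rule are
invented), a spider middleware that post-processes callback output and handles
spider exceptions could look like this::

    class ExampleSpiderMiddleware:
        def process_spider_output(self, response, result, spider):
            # Post-process the output of spider callbacks:
            # drop dict items missing a field, pass everything else through.
            for item_or_request in result:
                if isinstance(item_or_request, dict) and not item_or_request.get('title'):
                    continue
                yield item_or_request

        def process_spider_exception(self, response, exception, spider):
            # Handle spider exceptions; returning None lets other middlewares
            # and the default error handling deal with it.
            spider.logger.warning('Callback failed for %s: %s', response.url, exception)
            return None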

Event-driven networking
=======================
@ -121,10 +166,10 @@ for concurrency.
For more information about asynchronous programming and Twisted see these
links:

* `Introduction to Deferreds in Twisted`_
* :doc:`twisted:core/howto/defer-intro`
* `Twisted - hello, asynchronous programming`_
* `Twisted Introduction - Krondo`_

.. _Twisted: http://twistedmatrix.com/trac/
.. _Introduction to Deferreds in Twisted: http://twistedmatrix.com/documents/current/core/howto/defer-intro.html
.. _Twisted - hello, asynchronous programming: http://jessenoller.com/2009/02/11/twisted-hello-asynchronous-programming/

.. _Twisted: https://twistedmatrix.com/trac/
.. _Twisted - hello, asynchronous programming: http://jessenoller.com/blog/2009/02/11/twisted-hello-asynchronous-programming/
.. _Twisted Introduction - Krondo: http://krondo.com/an-introduction-to-asynchronous-programming-and-twisted/

40
docs/topics/asyncio.rst
Normal file
@ -0,0 +1,40 @@
=======
asyncio
=======

.. versionadded:: 2.0

Scrapy has partial support for :mod:`asyncio`. After you :ref:`install the asyncio
reactor <install-asyncio>`, you may use :mod:`asyncio` and
:mod:`asyncio`-powered libraries in any :doc:`coroutine <coroutines>`.

.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
             versions may introduce related changes without a deprecation
             period or warning.

.. _install-asyncio:

Installing the asyncio reactor
==============================

To enable :mod:`asyncio` support, set the :setting:`TWISTED_REACTOR` setting to
``'twisted.internet.asyncioreactor.AsyncioSelectorReactor'``.

If you are using :class:`~scrapy.crawler.CrawlerRunner`, you also need to
install the :class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`
reactor manually. You can do that using
:func:`~scrapy.utils.reactor.install_reactor`::

    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
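
As a fuller sketch of that setup (``MySpider`` is assumed to be defined
elsewhere; this is an illustration, not a prescribed layout)::

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.reactor import install_reactor

    # Install the asyncio reactor before the default Twisted reactor is set up.
    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

    from twisted.internet import reactor  # imported after installing the reactor

    configure_logging()
    runner = CrawlerRunner(settings={
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
    })
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()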

.. _using-custom-loops:

Using custom asyncio loops
==========================

You can also use custom asyncio event loops with the asyncio reactor. Set the
:setting:`ASYNCIO_EVENT_LOOP` setting to the import path of the desired event loop class to
use it instead of the default asyncio event loop.
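
For example, assuming the third-party ``uvloop`` package is installed, a
``settings.py`` fragment could look like this (an illustrative sketch)::

    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
    ASYNCIO_EVENT_LOOP = 'uvloop.Loop'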

@ -1,3 +1,5 @@
.. _topics-autothrottle:

======================
AutoThrottle extension
======================
@ -9,14 +11,66 @@ Design goals
============

1. be nicer to sites instead of using default download delay of zero
2. automatically adjust scrapy to the optimum crawling speed, so the user
   doesn't have to tune the download delays and concurrent requests to find the
   optimum one. the user only needs to specify the maximum concurrent requests
2. automatically adjust Scrapy to the optimum crawling speed, so the user
   doesn't have to tune the download delays to find the optimum one.
   The user only needs to specify the maximum concurrent requests
   it allows, and the extension does the rest.

.. _autothrottle-algorithm:

How it works
============

AutoThrottle extension adjusts download delays dynamically to make spider send
:setting:`AUTOTHROTTLE_TARGET_CONCURRENCY` concurrent requests on average
to each remote website.

It uses download latency to compute the delays. The main idea is the
following: if a server needs ``latency`` seconds to respond, a client
should send a request each ``latency/N`` seconds to have ``N`` requests
processed in parallel.

Instead of adjusting the delays one can just set a small fixed
download delay and impose hard limits on concurrency using
:setting:`CONCURRENT_REQUESTS_PER_DOMAIN` or
:setting:`CONCURRENT_REQUESTS_PER_IP` options. It will provide a similar
effect, but there are some important differences:

* because the download delay is small there will be occasional bursts
  of requests;
* often non-200 (error) responses can be returned faster than regular
  responses, so with a small download delay and a hard concurrency limit
  crawler will be sending requests to server faster when server starts to
  return errors. But this is an opposite of what crawler should do - in case
  of errors it makes more sense to slow down: these errors may be caused by
  the high request rate.

AutoThrottle doesn't have these issues.

Throttling algorithm
====================

AutoThrottle algorithm adjusts download delays based on the following rules:

1. spiders always start with a download delay of
   :setting:`AUTOTHROTTLE_START_DELAY`;
2. when a response is received, the target download delay is calculated as
   ``latency / N`` where ``latency`` is a latency of the response,
   and ``N`` is :setting:`AUTOTHROTTLE_TARGET_CONCURRENCY`.
3. download delay for next requests is set to the average of previous
   download delay and the target download delay;
4. latencies of non-200 responses are not allowed to decrease the delay;
5. download delay can't become less than :setting:`DOWNLOAD_DELAY` or greater
   than :setting:`AUTOTHROTTLE_MAX_DELAY`

.. note:: The AutoThrottle extension honours the standard Scrapy settings for
    concurrency and delay. This means that it will respect
    :setting:`CONCURRENT_REQUESTS_PER_DOMAIN` and
    :setting:`CONCURRENT_REQUESTS_PER_IP` options and
    never set a download delay lower than :setting:`DOWNLOAD_DELAY`.
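
The update rule described above can be sketched in Python as follows (this is
an editor's illustration of rules 2 to 5, not the actual AutoThrottle source)::

    def next_delay(prev_delay, latency, ok_response,
                   target_concurrency=1.0, min_delay=0.0, max_delay=60.0):
        target = latency / target_concurrency            # rule 2
        new_delay = (prev_delay + target) / 2.0          # rule 3
        if not ok_response and new_delay < prev_delay:   # rule 4
            new_delay = prev_delay
        return max(min_delay, min(new_delay, max_delay))  # rule 5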

.. _download-latency:

In Scrapy, the download latency is measured as the time elapsed between
establishing the TCP connection and receiving the HTTP headers.

@ -26,24 +80,6 @@ callback, for example, and unable to attend downloads. However, these latencies
should still give a reasonable estimate of how busy Scrapy (and ultimately, the
server) is, and this extension builds on that premise.

.. _autothrottle-algorithm:

Throttling algorithm
====================

This adjusts download delays and concurrency based on the following rules:

1. spiders always start with one concurrent request and a download delay of
   :setting:`AUTOTHROTTLE_START_DELAY`
2. when a response is received, the download delay is adjusted to the
   average of previous download delay and the latency of the response.

.. note:: The AutoThrottle extension honours the standard Scrapy settings for
    concurrency and delay. This means that it will never set a download delay
    lower than :setting:`DOWNLOAD_DELAY` or a concurrency higher than
    :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`
    (or :setting:`CONCURRENT_REQUESTS_PER_IP`, depending on which one you use).

Settings
========

@ -52,6 +88,7 @@ The settings used to control the AutoThrottle extension are:
* :setting:`AUTOTHROTTLE_ENABLED`
* :setting:`AUTOTHROTTLE_START_DELAY`
* :setting:`AUTOTHROTTLE_MAX_DELAY`
* :setting:`AUTOTHROTTLE_TARGET_CONCURRENCY`
* :setting:`AUTOTHROTTLE_DEBUG`
* :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`
* :setting:`CONCURRENT_REQUESTS_PER_IP`

@ -86,6 +123,36 @@ Default: ``60.0``

The maximum download delay (in seconds) to be set in case of high latencies.

.. setting:: AUTOTHROTTLE_TARGET_CONCURRENCY

AUTOTHROTTLE_TARGET_CONCURRENCY
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. versionadded:: 1.1

Default: ``1.0``

Average number of requests Scrapy should be sending in parallel to remote
websites.

By default, AutoThrottle adjusts the delay to send a single
concurrent request to each of the remote websites. Set this option to
a higher value (e.g. ``2.0``) to increase the throughput and the load on remote
servers. A lower ``AUTOTHROTTLE_TARGET_CONCURRENCY`` value
(e.g. ``0.5``) makes the crawler more conservative and polite.

Note that :setting:`CONCURRENT_REQUESTS_PER_DOMAIN`
and :setting:`CONCURRENT_REQUESTS_PER_IP` options are still respected
when AutoThrottle extension is enabled. This means that if
``AUTOTHROTTLE_TARGET_CONCURRENCY`` is set to a value higher than
:setting:`CONCURRENT_REQUESTS_PER_DOMAIN` or
:setting:`CONCURRENT_REQUESTS_PER_IP`, the crawler won't reach this number
of concurrent requests.

At every given time point Scrapy can be sending more or less concurrent
requests than ``AUTOTHROTTLE_TARGET_CONCURRENCY``; it is a suggested
value the crawler tries to approach, not a hard limit.

.. setting:: AUTOTHROTTLE_DEBUG

AUTOTHROTTLE_DEBUG

@ -18,40 +18,66 @@ To run it use::

You should see an output like this::

    2013-05-16 13:08:46-0300 [scrapy] INFO: Scrapy 0.17.0 started (bot: scrapybot)
    2013-05-16 13:08:47-0300 [follow] INFO: Spider opened
    2013-05-16 13:08:47-0300 [follow] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:48-0300 [follow] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:49-0300 [follow] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:50-0300 [follow] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:51-0300 [follow] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:52-0300 [follow] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:53-0300 [follow] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:54-0300 [follow] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:55-0300 [follow] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:56-0300 [follow] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:57-0300 [follow] INFO: Closing spider (closespider_timeout)
    2013-05-16 13:08:57-0300 [follow] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2013-05-16 13:08:57-0300 [follow] INFO: Dumping Scrapy stats:
        {'downloader/request_bytes': 231508,
         'downloader/request_count': 682,
         'downloader/request_method_count/GET': 682,
         'downloader/response_bytes': 1172802,
         'downloader/response_count': 682,
         'downloader/response_status_count/200': 682,
         'finish_reason': 'closespider_timeout',
         'finish_time': datetime.datetime(2013, 5, 16, 16, 8, 57, 985539),
         'log_count/INFO': 14,
         'request_depth_max': 34,
         'response_received_count': 682,
         'scheduler/dequeued': 682,
         'scheduler/dequeued/memory': 682,
         'scheduler/enqueued': 12767,
         'scheduler/enqueued/memory': 12767,
         'start_time': datetime.datetime(2013, 5, 16, 16, 8, 47, 676539)}
    2013-05-16 13:08:57-0300 [follow] INFO: Spider closed (closespider_timeout)
    2016-12-16 21:18:48 [scrapy.utils.log] INFO: Scrapy 1.2.2 started (bot: quotesbot)
    2016-12-16 21:18:48 [scrapy.utils.log] INFO: Overridden settings: {'CLOSESPIDER_TIMEOUT': 10, 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['quotesbot.spiders'], 'LOGSTATS_INTERVAL': 1, 'BOT_NAME': 'quotesbot', 'LOG_LEVEL': 'INFO', 'NEWSPIDER_MODULE': 'quotesbot.spiders'}
    2016-12-16 21:18:49 [scrapy.middleware] INFO: Enabled extensions:
    ['scrapy.extensions.closespider.CloseSpider',
     'scrapy.extensions.logstats.LogStats',
     'scrapy.extensions.telnet.TelnetConsole',
     'scrapy.extensions.corestats.CoreStats']
    2016-12-16 21:18:49 [scrapy.middleware] INFO: Enabled downloader middlewares:
    ['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
     'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
     'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
     'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
     'scrapy.downloadermiddlewares.retry.RetryMiddleware',
     'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
     'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
     'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
     'scrapy.downloadermiddlewares.stats.DownloaderStats']
    2016-12-16 21:18:49 [scrapy.middleware] INFO: Enabled spider middlewares:
    ['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
     'scrapy.spidermiddlewares.referer.RefererMiddleware',
     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
     'scrapy.spidermiddlewares.depth.DepthMiddleware']
    2016-12-16 21:18:49 [scrapy.middleware] INFO: Enabled item pipelines:
    []
    2016-12-16 21:18:49 [scrapy.core.engine] INFO: Spider opened
    2016-12-16 21:18:49 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:50 [scrapy.extensions.logstats] INFO: Crawled 70 pages (at 4200 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:51 [scrapy.extensions.logstats] INFO: Crawled 134 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:52 [scrapy.extensions.logstats] INFO: Crawled 198 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:53 [scrapy.extensions.logstats] INFO: Crawled 254 pages (at 3360 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:54 [scrapy.extensions.logstats] INFO: Crawled 302 pages (at 2880 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:55 [scrapy.extensions.logstats] INFO: Crawled 358 pages (at 3360 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:56 [scrapy.extensions.logstats] INFO: Crawled 406 pages (at 2880 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:57 [scrapy.extensions.logstats] INFO: Crawled 438 pages (at 1920 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:58 [scrapy.extensions.logstats] INFO: Crawled 470 pages (at 1920 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:18:59 [scrapy.core.engine] INFO: Closing spider (closespider_timeout)
    2016-12-16 21:18:59 [scrapy.extensions.logstats] INFO: Crawled 518 pages (at 2880 pages/min), scraped 0 items (at 0 items/min)
    2016-12-16 21:19:00 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 229995,
     'downloader/request_count': 534,
     'downloader/request_method_count/GET': 534,
     'downloader/response_bytes': 1565504,
     'downloader/response_count': 534,
     'downloader/response_status_count/200': 534,
     'finish_reason': 'closespider_timeout',
     'finish_time': datetime.datetime(2016, 12, 16, 16, 19, 0, 647725),
     'log_count/INFO': 17,
     'request_depth_max': 19,
     'response_received_count': 534,
     'scheduler/dequeued': 533,
     'scheduler/dequeued/memory': 533,
     'scheduler/enqueued': 10661,
     'scheduler/enqueued/memory': 10661,
     'start_time': datetime.datetime(2016, 12, 16, 16, 18, 49, 799869)}
    2016-12-16 21:19:00 [scrapy.core.engine] INFO: Spider closed (closespider_timeout)

That tells you that Scrapy is able to crawl about 3900 pages per minute in the
That tells you that Scrapy is able to crawl about 3000 pages per minute in the
hardware where you run it. Note that this is a very simple spider intended to
follow links; any custom spider you write will probably do more stuff, which
results in slower crawl rates. How much slower depends on how much your spider does

@ -20,7 +20,7 @@ These are some common properties often found in broad crawls:

* they crawl many domains (often, unbounded) instead of a specific set of sites

* they don't necessarily crawl domains to completion, because it would
* they don't necessarily crawl domains to completion, because it would be
  impractical (or impossible) to do so, and instead limit the crawl by time or
  number of pages crawled

@ -34,29 +34,76 @@ These are some common properties often found in broad crawls:

As said above, Scrapy default settings are optimized for focused crawls, not
broad crawls. However, due to its asynchronous architecture, Scrapy is very
well suited for performing fast broad crawls. This page summarize some things
well suited for performing fast broad crawls. This page summarizes some things
you need to keep in mind when using Scrapy for doing broad crawls, along with
concrete suggestions of Scrapy settings to tune in order to achieve an
efficient broad crawl.

.. _broad-crawls-scheduler-priority-queue:

Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
=================================================

Scrapy’s default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
It works best during a single-domain crawl. It does not work well with crawling
many different domains in parallel.

To apply the recommended priority queue use::

    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'

.. _broad-crawls-concurrency:

Increase concurrency
====================

Concurrency is the number of requests that are processed in parallel. There is
a global limit and a per-domain limit.
a global limit (:setting:`CONCURRENT_REQUESTS`) and an additional limit that
can be set either per domain (:setting:`CONCURRENT_REQUESTS_PER_DOMAIN`) or per
IP (:setting:`CONCURRENT_REQUESTS_PER_IP`).

.. note:: The scheduler priority queue :ref:`recommended for broad crawls
    <broad-crawls-scheduler-priority-queue>` does not support
    :setting:`CONCURRENT_REQUESTS_PER_IP`.

The default global concurrency limit in Scrapy is not suitable for crawling
many different domains in parallel, so you will want to increase it. How much
to increase it will depend on how much CPU your crawler will have available. A
good starting point is ``100``, but the best way to find out is by doing some
trials and identifying at what concurrency your Scrapy process gets CPU
bounded. For optimum performance, You should pick a concurrency where CPU usage
is at 80-90%.
many different domains in parallel, so you will want to increase it. How much
to increase it will depend on how much CPU and memory your crawler will have
available.

To increase the global concurrency use::
A good starting point is ``100``::

    CONCURRENT_REQUESTS = 100

But the best way to find out is by doing some trials and identifying at what
concurrency your Scrapy process gets CPU bounded. For optimum performance, you
should pick a concurrency where CPU usage is at 80-90%.

Increasing concurrency also increases memory usage. If memory usage is a
concern, you might need to lower your global concurrency limit accordingly.

Increase Twisted IO thread pool maximum size
============================================

Currently Scrapy does DNS resolution in a blocking way with the usage of a thread
pool. With higher concurrency levels the crawling could be slow or even fail
hitting DNS resolver timeouts. A possible solution is to increase the number of
threads handling DNS queries. The DNS queue will be processed faster, speeding
up the establishing of connections and crawling overall.

To increase maximum thread pool size use::

    REACTOR_THREADPOOL_MAXSIZE = 20

Setup your own DNS
==================

If you have multiple crawling processes and a single central DNS, it can act
like a DoS attack on the DNS server, resulting in the slow-down of the entire network or
even blocking your machines. To avoid this, set up your own DNS server with a
local cache and upstream to some large DNS like OpenDNS or Verizon.

Reduce log level
================

@ -64,8 +111,8 @@ When doing broad crawls you are often only interested in the crawl rates you
get and any errors found. These stats are reported by Scrapy when using the
``INFO`` log level. In order to save CPU (and log storage requirements) you
should not use ``DEBUG`` log level when performing large broad crawls in
production. Using ``DEBUG`` level when developing your (broad) crawler may fine
though.
production. Using ``DEBUG`` level when developing your (broad) crawler may be
fine though.

To set the log level use::

@ -141,4 +188,33 @@ AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.

.. _ajax crawlable: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
.. _ajax crawlable: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started

.. _broad-crawls-bfo:

Crawl in BFO order
==================

:ref:`Scrapy crawls in DFO order by default <faq-bfo-dfo>`.

In broad crawls, however, page crawling tends to be faster than page
processing. As a result, unprocessed early requests stay in memory until the
final depth is reached, which can significantly increase memory usage.

:ref:`Crawl in BFO order <faq-bfo-dfo>` instead to save memory.
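
A sketch of the settings usually involved in switching to BFO (double-check
against the FAQ entry referenced above)::

    DEPTH_PRIORITY = 1
    SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
    SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'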

Be mindful of memory leaks
==========================

If your broad crawl shows a high memory usage, in addition to :ref:`crawling in
BFO order <broad-crawls-bfo>` and :ref:`lowering concurrency
<broad-crawls-concurrency>` you should :ref:`debug your memory leaks
<topics-leaks>`.

Install a specific Twisted reactor
==================================

If the crawl is exceeding the system's capabilities, you might want to try
installing a specific Twisted reactor, via the :setting:`TWISTED_REACTOR` setting.

@ -1,3 +1,5 @@
.. highlight:: none

.. _topics-commands:

=================
@ -13,6 +15,33 @@ just call "commands" or "Scrapy commands".
The Scrapy tool provides several commands, for multiple purposes, and each one
accepts a different set of arguments and options.

(The ``scrapy deploy`` command has been removed in 1.0 in favor of the
standalone ``scrapyd-deploy``. See `Deploying your project`_.)

.. _topics-config-settings:

Configuration settings
======================

Scrapy will look for configuration parameters in ini-style ``scrapy.cfg`` files
in standard locations:

1. ``/etc/scrapy.cfg`` or ``c:\scrapy\scrapy.cfg`` (system-wide),
2. ``~/.config/scrapy.cfg`` (``$XDG_CONFIG_HOME``) and ``~/.scrapy.cfg`` (``$HOME``)
   for global (user-wide) settings, and
3. ``scrapy.cfg`` inside a Scrapy project's root (see next section).

Settings from these files are merged in the listed order of preference:
user-defined values have higher priority than system-wide defaults
and project-wide settings will override all others, when defined.

Scrapy also understands, and can be configured through, a number of environment
variables. Currently these are:

* ``SCRAPY_SETTINGS_MODULE`` (see :ref:`topics-settings-module-envvar`)
* ``SCRAPY_PROJECT`` (see :ref:`topics-project-envvar`)
* ``SCRAPY_PYTHON_SHELL`` (see :ref:`topics-shell`)

.. _topics-project-structure:

Default structure of Scrapy projects
@ -28,6 +57,7 @@ structure by default, similar to this::

    myproject/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/

@ -38,11 +68,42 @@ structure by default, similar to this::

The directory where the ``scrapy.cfg`` file resides is known as the *project
root directory*. That file contains the name of the python module that defines
the project settings. Here is an example::
the project settings. Here is an example:

.. code-block:: ini

    [settings]
    default = myproject.settings

.. _topics-project-envvar:

Sharing the root directory between projects
===========================================

A project root directory, the one that contains the ``scrapy.cfg``, may be
shared by multiple Scrapy projects, each with its own settings module.

In that case, you must define one or more aliases for those settings modules
under ``[settings]`` in your ``scrapy.cfg`` file:

.. code-block:: ini

    [settings]
    default = myproject1.settings
    project1 = myproject1.settings
    project2 = myproject2.settings

By default, the ``scrapy`` command-line tool will use the ``default`` settings.
Use the ``SCRAPY_PROJECT`` environment variable to specify a different project
for ``scrapy`` to use::

    $ scrapy settings --get BOT_NAME
    Project 1 Bot
    $ export SCRAPY_PROJECT=project2
    $ scrapy settings --get BOT_NAME
    Project 2 Bot

Using the ``scrapy`` tool
=========================

@ -59,8 +120,8 @@ some usage help and the available commands::

    fetch         Fetch a URL using the Scrapy downloader
    [...]

The first line will print the currently active project, if you're inside a
Scrapy project. In this, it was run from outside a project. If run from inside
The first line will print the currently active project if you're inside a
Scrapy project. In this example it was run from outside a project. If run from inside
a project it would have printed something like this::

    Scrapy X.Y - project: myproject

@ -76,13 +137,14 @@ Creating projects

The first thing you typically do with the ``scrapy`` tool is create your Scrapy
project::

    scrapy startproject myproject
    scrapy startproject myproject [project_dir]

That will create a Scrapy project under the ``myproject`` directory.
That will create a Scrapy project under the ``project_dir`` directory.
If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``myproject``.

Next, you go inside the new project directory::

    cd myproject
    cd project_dir

And you're ready to use the ``scrapy`` command to manage and control your
project from there.

@ -114,7 +176,7 @@ Available tool commands
=======================

This section contains a list of the available built-in commands with a
description and some usage examples. Remember you can always get more info
description and some usage examples. Remember, you can always get more info
about each command by running::

    scrapy <command> -h

@ -132,6 +194,7 @@ settings).

Global commands:

* :command:`startproject`
* :command:`genspider`
* :command:`settings`
* :command:`runspider`
* :command:`shell`

@ -146,8 +209,6 @@ Project-only commands:

* :command:`list`
* :command:`edit`
* :command:`parse`
* :command:`genspider`
* :command:`deploy`
* :command:`bench`

.. command:: startproject

@ -155,11 +216,12 @@ Project-only commands:

startproject
------------

* Syntax: ``scrapy startproject <project_name>``
* Syntax: ``scrapy startproject <project_name> [project_dir]``
* Requires project: *no*

Creates a new Scrapy project named ``project_name``, under the ``project_name``
Creates a new Scrapy project named ``project_name``, under the ``project_dir``
directory.
If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``project_name``.

Usage example::

@ -171,14 +233,9 @@ genspider
---------

* Syntax: ``scrapy genspider [-t template] <name> <domain>``
* Requires project: *yes*
* Requires project: *no*

Create a new spider in the current project.

This is just a convenient shortcut command for creating spiders based on
pre-defined templates, but certainly not the only way to create spiders. You
can just create the spider source code files yourself, instead of using this
command.
Create a new spider in the current folder or in the current project's ``spiders`` folder, if called from inside a project. The ``<name>`` parameter is set as the spider's ``name``, while ``<domain>`` is used to generate the ``allowed_domains`` and ``start_urls`` spider's attributes.

Usage example::

@ -189,22 +246,16 @@ Usage example::

      csvfeed
      xmlfeed

    $ scrapy genspider -d basic
    import scrapy
    $ scrapy genspider example example.com
    Created spider 'example' using template 'basic'

    class $classname(scrapy.Spider):
        name = "$name"
        allowed_domains = ["$domain"]
        start_urls = (
            'http://www.$domain/',
        )
    $ scrapy genspider -t crawl scrapyorg scrapy.org
    Created spider 'scrapyorg' using template 'crawl'

        def parse(self, response):
            pass

    $ scrapy genspider -t basic example example.com
    Created spider 'example' using template 'basic' in module:
      mybot.spiders.example
This is just a convenience shortcut command for creating spiders based on
pre-defined templates, but certainly not the only way to create spiders. You
can just create the spider source code files yourself, instead of using this
command.

.. command:: crawl

@ -232,6 +283,8 @@ check

Run contract checks.

.. skip: start

Usage examples::

    $ scrapy check -l

@ -249,6 +302,8 @@ Usage examples::

    [FAILED] first_spider:parse
    >>> Returned 92 requests, expected 0..4

.. skip: end

.. command:: list

list

@ -274,12 +329,12 @@ edit

* Syntax: ``scrapy edit <spider>``
* Requires project: *yes*

Edit the given spider using the editor defined in the :setting:`EDITOR`
setting.
Edit the given spider using the editor defined in the ``EDITOR`` environment
variable or (if unset) the :setting:`EDITOR` setting.

This command is provided only as a convenient shortcut for the most common
This command is provided only as a convenience shortcut for the most common
case, the developer is of course free to choose any tool or IDE to write and
debug his spiders.
debug spiders.

Usage example::

@ -297,7 +352,7 @@ Downloads the given URL using the Scrapy downloader and writes the contents to
standard output.

The interesting thing about this command is that it fetches the page how the
spider would download it. For example, if the spider has an ``USER_AGENT``
spider would download it. For example, if the spider has a ``USER_AGENT``
attribute which overrides the User Agent, it will use that one.

So this command can be used to "see" how your spider would fetch a certain page.

@ -305,6 +360,14 @@ So this command can be used to "see" how your spider would fetch a certain page.

If used outside a project, no particular per-spider behaviour would be applied
and it will just use the default Scrapy downloader settings.

Supported options:

* ``--spider=SPIDER``: bypass spider autodetection and force use of specific spider

* ``--headers``: print the response's HTTP headers instead of the response's body

* ``--no-redirect``: do not follow HTTP 3xx redirects (default is to follow them)

Usage examples::

    $ scrapy fetch --nolog http://www.example.com/some/page.html

@ -333,6 +396,12 @@ Opens the given URL in a browser, as your Scrapy spider would "see" it.

Sometimes spiders see pages differently from regular users, so this can be used
to check what the spider "sees" and confirm it's what you expect.

Supported options:

* ``--spider=SPIDER``: bypass spider autodetection and force use of specific spider

* ``--no-redirect``: do not follow HTTP 3xx redirects (default is to follow them)

Usage example::

    $ scrapy view http://www.example.com/some/page.html

@ -347,13 +416,38 @@ shell

* Requires project: *no*

Starts the Scrapy shell for the given URL (if given) or empty if no URL is
given. See :ref:`topics-shell` for more info.
given. Also supports UNIX-style local file paths, either relative with
``./`` or ``../`` prefixes or absolute file paths.
See :ref:`topics-shell` for more info.

Supported options:

* ``--spider=SPIDER``: bypass spider autodetection and force use of specific spider

* ``-c code``: evaluate the code in the shell, print the result and exit

* ``--no-redirect``: do not follow HTTP 3xx redirects (default is to follow them);
  this only affects the URL you may pass as argument on the command line;
  once you are inside the shell, ``fetch(url)`` will still follow HTTP redirects by default.

Usage example::

    $ scrapy shell http://www.example.com/some/page.html
    [ ... scrapy shell starts ... ]

    $ scrapy shell --nolog http://www.example.com/ -c '(response.status, response.url)'
    (200, 'http://www.example.com/')

    # shell follows HTTP redirects by default
    $ scrapy shell --nolog http://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.com%2F -c '(response.status, response.url)'
    (200, 'http://example.com/')

    # you can disable this with --no-redirect
    # (only for the URL passed as command line argument)
    $ scrapy shell --no-redirect --nolog http://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.com%2F -c '(response.status, response.url)'
    (302, 'http://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.com%2F')

.. command:: parse

parse

@ -374,9 +468,15 @@ Supported options:

* ``--callback`` or ``-c``: spider method to use as callback for parsing the
  response

* ``--meta`` or ``-m``: additional request meta that will be passed to the callback
  request. This must be a valid json string. Example: --meta='{"foo" : "bar"}'

* ``--cbkwargs``: additional keyword arguments that will be passed to the callback.
  This must be a valid json string. Example: --cbkwargs='{"foo" : "bar"}'

* ``--pipelines``: process items through pipelines

* ``--rules`` or ``-r``: use :class:`~scrapy.contrib.spiders.CrawlSpider`
* ``--rules`` or ``-r``: use :class:`~scrapy.spiders.CrawlSpider`
  rules to discover the callback (i.e. spider method) to use for parsing the
  response

@ -391,6 +491,12 @@ Supported options:

* ``--verbose`` or ``-v``: display information for each depth level

* ``--output`` or ``-o``: dump scraped items to a file

  .. versionadded:: 2.3

.. skip: start

Usage example::

    $ scrapy parse http://www.example.com/ -c parse_item

@ -398,13 +504,15 @@ Usage example::

    >>> STATUS DEPTH LEVEL 1 <<<
    # Scraped Items  ------------------------------------------------------------
    [{'name': u'Example item',
     'category': u'Furniture',
     'length': u'12 cm'}]
    [{'name': 'Example item',
     'category': 'Furniture',
     'length': '12 cm'}]

    # Requests  -----------------------------------------------------------------
    []

.. skip: end

.. command:: settings

@ -453,18 +561,6 @@ version

Prints the Scrapy version. If used with ``-v`` it also prints Python, Twisted
and Platform info, which is useful for bug reports.

.. command:: deploy

deploy
------

.. versionadded:: 0.11

* Syntax: ``scrapy deploy [ <target:project> | -l <target> | -L ]``
* Requires project: *yes*

Deploy the project into a Scrapyd server. See `Deploying your project`_.

.. command:: bench

bench

@ -484,7 +580,7 @@ You can also add your custom project commands by using the
:setting:`COMMANDS_MODULE` setting. See the Scrapy commands in
`scrapy/commands`_ for examples on how to implement your commands.

.. _scrapy/commands: https://github.com/scrapy/scrapy/blob/master/scrapy/commands
.. _scrapy/commands: https://github.com/scrapy/scrapy/tree/master/scrapy/commands
.. setting:: COMMANDS_MODULE

COMMANDS_MODULE

@ -495,8 +591,35 @@ Default: ``''`` (empty string)

A module to use for looking up custom Scrapy commands. This is used to add custom
commands for your Scrapy project.

Example::
Example:

.. code-block:: python

    COMMANDS_MODULE = 'mybot.commands'

.. _Deploying your project: http://scrapyd.readthedocs.org/en/latest/deploy.html
.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html
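
As a rough sketch of what such a module could contain, a minimal custom
command might look like this (the module, class and description are invented
for this example; see the linked ``scrapy/commands`` source for the real
patterns)::

    # mybot/commands/countspiders.py
    from scrapy.commands import ScrapyCommand

    class Command(ScrapyCommand):
        requires_project = True

        def short_desc(self):
            return "Print the number of spiders in the project"

        def run(self, args, opts):
            print(len(self.crawler_process.spider_loader.list()))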

Register commands via setup.py entry points
-------------------------------------------

.. note:: This is an experimental feature, use with caution.

You can also add Scrapy commands from an external library by adding a
``scrapy.commands`` section in the entry points of the library ``setup.py``
file.

The following example adds ``my_command`` command:

.. skip: next

.. code-block:: python

    from setuptools import setup, find_packages

    setup(name='scrapy-mymodule',
        entry_points={
            'scrapy.commands': [
                'my_command=my_scrapy_module.commands:MyCommand',
            ],
        },
    )

@ -6,10 +6,6 @@ Spiders Contracts

.. versionadded:: 0.15

.. note:: This is a new feature (introduced in Scrapy 0.15) and may be subject
    to minor functionality/API updates. Check the :ref:`release notes <news>` to
    be notified of updates.

Testing spiders can get particularly annoying and while nothing prevents you
from writing unit tests the task gets cumbersome quickly. Scrapy offers an
integrated way of testing your spiders by the means of contracts.

@ -35,12 +31,20 @@ This callback is tested using three built-in contracts:

.. class:: UrlContract

    This contract (``@url``) sets the sample url used when checking other
    This contract (``@url``) sets the sample URL used when checking other
    contract conditions for this spider. This contract is mandatory. All
    callbacks lacking this contract are ignored when running the checks::

        @url url

.. class:: CallbackKeywordArgumentsContract

    This contract (``@cb_kwargs``) sets the :attr:`cb_kwargs <scrapy.http.Request.cb_kwargs>`
    attribute for the sample request. It must be a valid JSON dictionary.
    ::

        @cb_kwargs {"arg1": "value1", "arg2": "value2", ...}

.. class:: ReturnsContract

    This contract (``@returns``) sets lower and upper bounds for the items and

@ -60,7 +64,7 @@ Use the :command:`check` command to run the contract checks.

Custom Contracts
================

If you find you need more power than the built-in scrapy contracts you can
If you find you need more power than the built-in Scrapy contracts you can
create and load your own contracts in the project by using the
:setting:`SPIDER_CONTRACTS` setting::

@ -69,15 +73,15 @@ create and load your own contracts in the project by using the

        'myproject.contracts.ItemValidate': 10,
    }

Each contract must inherit from :class:`scrapy.contracts.Contract` and can
Each contract must inherit from :class:`~scrapy.contracts.Contract` and can
override three methods:

.. module:: scrapy.contracts

.. class:: Contract(method, \*args)
.. class:: Contract(method, *args)

    :param method: callback function to which the contract is associated
    :type method: function
    :type method: collections.abc.Callable

    :param args: list of arguments passed into the docstring (whitespace
        separated)

@ -86,8 +90,11 @@ override three methods:

.. method:: Contract.adjust_request_args(args)

    This receives a ``dict`` as an argument containing default arguments
    for :class:`~scrapy.http.Request` object. Must return the same or a
    modified version of it.
    for request object. :class:`~scrapy.http.Request` is used by default,
    but this can be changed with the ``request_cls`` attribute.
    If multiple contracts in chain have this attribute defined, the last one is used.

    Must return the same or a modified version of it.

.. method:: Contract.pre_process(response)

@ -99,9 +106,14 @@ override three methods:

    This allows processing the output of the callback. Iterators are
    converted to lists before being passed to this hook.

Raise :class:`~scrapy.exceptions.ContractFail` from
:class:`~scrapy.contracts.Contract.pre_process` or
:class:`~scrapy.contracts.Contract.post_process` if expectations are not met:

.. autoclass:: scrapy.exceptions.ContractFail

Here is a demo contract which checks the presence of a custom header in the
response received. Raise :class:`scrapy.exceptions.ContractFail` in order to
get the failures pretty printed::
response received::

    from scrapy.contracts import Contract
    from scrapy.exceptions import ContractFail

@ -117,3 +129,22 @@ get the failures pretty printed::

            for header in self.args:
                if header not in response.headers:
                    raise ContractFail('X-CustomHeader not present')
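
To also illustrate ``adjust_request_args``, here is a sketch of a contract
that adds cookies to the sample request (the contract name and the
``key=value`` argument convention are invented for this example)::

    from scrapy.contracts import Contract

    class WithCookiesContract(Contract):
        """Usage in a docstring: @with_cookies lang=en currency=EUR"""

        name = 'with_cookies'

        def adjust_request_args(self, args):
            # ``args`` holds the default keyword arguments for the sample
            # Request; return the same dict or a modified version of it.
            args['cookies'] = dict(arg.split('=', 1) for arg in self.args)
            return args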

.. _detecting-contract-check-runs:

Detecting check runs
====================

When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
set to the ``true`` string. You can use :data:`os.environ` to perform any change to
your spiders or your settings when ``scrapy check`` is used::

    import os
    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def __init__(self):
            if os.environ.get('SCRAPY_CHECK'):
                pass  # Do some scraper adjustments when a check is running
112
docs/topics/coroutines.rst
Normal file
@ -0,0 +1,112 @@
==========
Coroutines
==========

.. versionadded:: 2.0

Scrapy has :ref:`partial support <coroutine-support>` for the
:ref:`coroutine syntax <async>`.

.. _coroutine-support:

Supported callables
===================

The following callables may be defined as coroutines using ``async def``, and
hence use coroutine syntax (e.g. ``await``, ``async for``, ``async with``):

- :class:`~scrapy.http.Request` callbacks.

  The following are known caveats of the current implementation that we aim
  to address in future versions of Scrapy:

  - The callback output is not processed until the whole callback finishes.

    As a side effect, if the callback raises an exception, none of its
    output is processed.

  - Because `asynchronous generators were introduced in Python 3.6`_, you
    can only use ``yield`` if you are using Python 3.6 or later.

    If you need to output multiple items or requests and you are using
    Python 3.5, return an iterable (e.g. a list) instead.

- The :meth:`process_item` method of
  :ref:`item pipelines <topics-item-pipeline>`.

- The
  :meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_request`,
  :meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_response`,
  and
  :meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_exception`
  methods of
  :ref:`downloader middlewares <topics-downloader-middleware-custom>`.

- :ref:`Signal handlers that support deferreds <signal-deferred>`.

.. _asynchronous generators were introduced in Python 3.6: https://www.python.org/dev/peps/pep-0525/

Usage
=====

There are several use cases for coroutines in Scrapy. Code that would
return Deferreds when written for previous Scrapy versions, such as downloader
middlewares and signal handlers, can be rewritten to be shorter and cleaner::

    from itemadapter import ItemAdapter

    class DbPipeline:
        def _update_item(self, data, item):
            adapter = ItemAdapter(item)
            adapter['field'] = data
            return item

        def process_item(self, item, spider):
            adapter = ItemAdapter(item)
            dfd = db.get_some_data(adapter['id'])
            dfd.addCallback(self._update_item, item)
            return dfd

becomes::

    from itemadapter import ItemAdapter

    class DbPipeline:
        async def process_item(self, item, spider):
            adapter = ItemAdapter(item)
            adapter['field'] = await db.get_some_data(adapter['id'])
            return item

Coroutines may be used to call asynchronous code. This includes other
coroutines, functions that return Deferreds and functions that return
:term:`awaitable objects <awaitable>` such as :class:`~asyncio.Future`.
This means you can use many useful Python libraries providing such code::

    class MySpider(Spider):
        # ...
        async def parse_with_deferred(self, response):
            additional_response = await treq.get('https://additional.url')
            additional_data = await treq.content(additional_response)
            # ... use response and additional_data to yield items and requests

        async def parse_with_asyncio(self, response):
            async with aiohttp.ClientSession() as session:
                async with session.get('https://additional.url') as additional_response:
                    additional_data = await additional_response.text()
            # ... use response and additional_data to yield items and requests

.. note:: Many libraries that use coroutines, such as `aio-libs`_, require the
    :mod:`asyncio` loop and to use them you need to
    :doc:`enable asyncio support in Scrapy<asyncio>`.

Common use cases for asynchronous code include:

* requesting data from websites, databases and other services (in callbacks,
  pipelines and middlewares);
* storing data in databases (in pipelines and middlewares);
* delaying the spider initialization until some external event (in the
  :signal:`spider_opened` handler);
* calling asynchronous Scrapy methods like ``ExecutionEngine.download`` (see
  :ref:`the screenshot pipeline example<ScreenshotPipeline>`).

.. _aio-libs: https://github.com/aio-libs

@ -5,7 +5,7 @@ Debugging Spiders
=================

This document explains the most common techniques for debugging spiders.
Consider the following scrapy spider below::
Consider the following Scrapy spider below::

    import scrapy
    from myproject.items import MyItem

@ -18,24 +18,25 @@ Consider the following scrapy spider below::

        )

        def parse(self, response):
            # collect `item_urls`
            # <processing code not shown>
            # collect `item_urls`
            for item_url in item_urls:
                yield scrapy.Request(item_url, self.parse_item)

        def parse_item(self, response):
            # <processing code not shown>
            item = MyItem()
            # populate `item` fields
            # and extract item_details_url
            yield scrapy.Request(item_details_url, self.parse_details, meta={'item': item})
            yield scrapy.Request(item_details_url, self.parse_details, cb_kwargs={'item': item})

        def parse_details(self, response):
            item = response.meta['item']
        def parse_details(self, response, item):
            # populate more `item` fields
            return item

Basically this is a simple spider which parses two pages of items (the
start_urls). Items also have a details page with additional information, so we
use the ``meta`` functionality of :class:`~scrapy.http.Request` to pass a
use the ``cb_kwargs`` functionality of :class:`~scrapy.http.Request` to pass a
partially populated item.

@ -47,6 +48,10 @@ The most basic way of checking the output of your spider is to use the
of the spider at the method level. It has the advantage of being flexible and
simple to use, but does not allow debugging code inside a method.

.. highlight:: none

.. skip: start

In order to see the item scraped from a specific url::

    $ scrapy parse --spider=myspider -c parse_item -d 2 <item_url>

@ -84,6 +89,8 @@ using::

    $ scrapy parse --spider=myspider -d 3 'http://example.com/page1'

.. skip: end

Scrapy Shell
============

@ -93,13 +100,14 @@ spider, it is of little help to check what happens inside a callback, besides
showing the response received and the output. How to debug the situation when
``parse_details`` sometimes receives no item?

.. highlight:: python

Fortunately, the :command:`shell` is your bread and butter in this case (see
:ref:`topics-shell-inspect-response`)::

    from scrapy.shell import inspect_response

    def parse_details(self, response):
        item = response.meta.get('item', None)
    def parse_details(self, response, item=None):
        if item:
            # populate more `item` fields
            return item

@ -132,17 +140,13 @@ Logging is another useful option for getting information about your spider run.
Although not as convenient, it comes with the advantage that the logs will be
available in all future runs should they be necessary again::

    from scrapy import log

    def parse_details(self, response):
        item = response.meta.get('item', None)
    def parse_details(self, response, item=None):
        if item:
            # populate more `item` fields
            return item
        else:
            self.log('No item received for %s' % response.url,
                level=log.WARNING)
            self.logger.warning('No item received for %s', response.url)

For more information, check the :ref:`topics-logging` section.

.. _base tag: http://www.w3schools.com/tags/tag_base.asp
.. _base tag: https://www.w3schools.com/tags/tag_base.asp
59
docs/topics/deploy.rst
Normal file
@ -0,0 +1,59 @@
|
||||
.. _topics-deploy:
|
||||
|
||||
=================
|
||||
Deploying Spiders
|
||||
=================
|
||||
|
||||
This section describes the different options you have for deploying your Scrapy
|
||||
spiders to run them on a regular basis. Running Scrapy spiders in your local
|
||||
machine is very convenient for the (early) development stage, but not so much
|
||||
when you need to execute long-running spiders or move spiders to run in
|
||||
production continuously. This is where the solutions for deploying Scrapy
|
||||
spiders come in.
|
||||
|
||||
Popular choices for deploying Scrapy spiders are:
|
||||
|
||||
* :ref:`Scrapyd <deploy-scrapyd>` (open source)
|
||||
* :ref:`Scrapy Cloud <deploy-scrapy-cloud>` (cloud-based)
|
||||
|
||||
.. _deploy-scrapyd:
|
||||
|
||||
Deploying to a Scrapyd Server
|
||||
=============================
|
||||
|
||||
`Scrapyd`_ is an open source application to run Scrapy spiders. It provides
|
||||
a server with HTTP API, capable of running and monitoring Scrapy spiders.
|
||||
|
||||
To deploy spiders to Scrapyd, you can use the scrapyd-deploy tool provided by
|
||||
the `scrapyd-client`_ package. Please refer to the `scrapyd-deploy
|
||||
documentation`_ for more information.
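
As a rough sketch, deploying typically amounts to adding a ``[deploy]`` target
to your project's ``scrapy.cfg`` and running ``scrapyd-deploy`` from the
project directory (the URL and project name below are placeholders; see the
`scrapyd-deploy documentation`_ for all available options)::

    [deploy]
    url = http://localhost:6800/
    project = myproject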
|
||||
|
||||
Scrapyd is maintained by some of the Scrapy developers.
|
||||
|
||||
.. _deploy-scrapy-cloud:
|
||||
|
||||
Deploying to Scrapy Cloud
|
||||
=========================
|
||||
|
||||
`Scrapy Cloud`_ is a hosted, cloud-based service by `Scrapinghub`_,
|
||||
the company behind Scrapy.
|
||||
|
||||
Scrapy Cloud removes the need to setup and monitor servers
|
||||
and provides a nice UI to manage spiders and review scraped items,
|
||||
logs and stats.
|
||||
|
||||
To deploy spiders to Scrapy Cloud you can use the `shub`_ command line tool.
|
||||
Please refer to the `Scrapy Cloud documentation`_ for more information.
|
||||
|
||||
Scrapy Cloud is compatible with Scrapyd and one can switch between
|
||||
them as needed - the configuration is read from the ``scrapy.cfg`` file
|
||||
just like ``scrapyd-deploy``.
|
||||
|
||||
.. _Scrapyd: https://github.com/scrapy/scrapyd
|
||||
.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
.. _Scrapy Cloud: https://scrapinghub.com/scrapy-cloud
|
||||
.. _scrapyd-client: https://github.com/scrapy/scrapyd-client
|
||||
.. _shub: https://doc.scrapinghub.com/shub.html
|
||||
.. _scrapyd-deploy documentation: https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
.. _Scrapy Cloud documentation: https://doc.scrapinghub.com/scrapy-cloud.html
|
||||
.. _Scrapinghub: https://scrapinghub.com/
|
311
docs/topics/developer-tools.rst
Normal file
@ -0,0 +1,311 @@
|
||||
.. _topics-developer-tools:
|
||||
|
||||
=================================================
|
||||
Using your browser's Developer Tools for scraping
|
||||
=================================================
|
||||
|
||||
Here is a general guide on how to use your browser's Developer Tools
|
||||
to ease the scraping process. Today almost all browsers come with
|
||||
built-in `Developer Tools`_ and although we will use Firefox in this
|
||||
guide, the concepts are applicable to any other browser.
|
||||
|
||||
In this guide we'll introduce the basic tools to use from a browser's
|
||||
Developer Tools by scraping `quotes.toscrape.com`_.
|
||||
|
||||
.. _topics-livedom:
|
||||
|
||||
Caveats with inspecting the live browser DOM
|
||||
============================================
|
||||
|
||||
Since Developer Tools operate on a live browser DOM, what you'll actually see
|
||||
when inspecting the page source is not the original HTML, but a modified one
|
||||
after applying some browser clean-up and executing JavaScript code. Firefox,
|
||||
in particular, is known for adding ``<tbody>`` elements to tables. Scrapy, on
|
||||
the other hand, does not modify the original page HTML, so you won't be able to
|
||||
extract any data if you use ``<tbody>`` in your XPath expressions.
|
||||
|
||||
Therefore, you should keep in mind the following things:
|
||||
|
||||
* Disable JavaScript while inspecting the DOM looking for XPaths to be
|
||||
used in Scrapy (in the Developer Tools settings click `Disable JavaScript`)
|
||||
|
||||
* Never use full XPath paths, use relative and clever ones based on attributes
|
||||
(such as ``id``, ``class``, ``width``, etc) or any identifying features like
|
||||
``contains(@href, 'image')``.
|
||||
|
||||
* Never include ``<tbody>`` elements in your XPath expressions unless you
|
||||
really know what you're doing
|
||||
|
||||
.. _topics-inspector:
|
||||
|
||||
Inspecting a website
|
||||
====================
|
||||
|
||||
By far the most handy feature of the Developer Tools is the `Inspector`
|
||||
feature, which allows you to inspect the underlying HTML code of
|
||||
any webpage. To demonstrate the Inspector, let's look at the
|
||||
`quotes.toscrape.com`_ site.
|
||||
|
||||
On the site we have a total of ten quotes from various authors with specific
|
||||
tags, as well as the Top Ten Tags. Let's say we want to extract all the quotes
|
||||
on this page, without any meta-information about authors, tags, etc.
|
||||
|
||||
Instead of viewing the whole source code for the page, we can simply right click
|
||||
on a quote and select ``Inspect Element (Q)``, which opens up the `Inspector`.
|
||||
In it you should see something like this:
|
||||
|
||||
.. image:: _images/inspector_01.png
|
||||
:width: 777
|
||||
:height: 469
|
||||
:alt: Firefox's Inspector-tool
|
||||
|
||||
The interesting part for us is this:
|
||||
|
||||
.. code-block:: html
|
||||
|
||||
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">(...)</span>
|
||||
<span>(...)</span>
|
||||
<div class="tags">(...)</div>
|
||||
</div>
|
||||
|
||||
If you hover over the first ``div`` directly above the ``span`` tag highlighted
|
||||
in the screenshot, you'll see that the corresponding section of the webpage gets
|
||||
highlighted as well. So now we have a section, but we can't find our quote text
|
||||
anywhere.
|
||||
|
||||
The advantage of the `Inspector` is that it automatically expands and collapses
|
||||
sections and tags of a webpage, which greatly improves readability. You can
|
||||
expand and collapse a tag by clicking on the arrow in front of it or by double
|
||||
clicking directly on the tag. If we expand the ``span`` tag with the ``class=
|
||||
"text"`` we will see the quote-text we clicked on. The `Inspector` lets you
|
||||
copy XPaths to selected elements. Let's try it out.
|
||||
|
||||
First open the Scrapy shell at http://quotes.toscrape.com/ in a terminal:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
$ scrapy shell "http://quotes.toscrape.com/"
|
||||
|
||||
Then, back to your web browser, right-click on the ``span`` tag, select
|
||||
``Copy > XPath`` and paste it in the Scrapy shell like so:
|
||||
|
||||
.. invisible-code-block: python
|
||||
|
||||
response = load_response('http://quotes.toscrape.com/', 'quotes.html')
|
||||
|
||||
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
|
||||
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
|
||||
|
||||
Adding ``text()`` at the end we are able to extract the first quote with this
|
||||
basic selector. But this XPath is not really that clever. All it does is
|
||||
go down a desired path in the source code starting from ``html``. So let's
|
||||
see if we can refine our XPath a bit:
|
||||
|
||||
If we check the `Inspector` again we'll see that directly beneath our
|
||||
expanded ``div`` tag we have nine identical ``div`` tags, each with the
|
||||
same attributes as our first. If we expand any of them, we'll see the same
|
||||
structure as with our first quote: Two ``span`` tags and one ``div`` tag. We can
|
||||
expand each ``span`` tag with the ``class="text"`` inside our ``div`` tags and
|
||||
see each quote:
|
||||
|
||||
.. code-block:: html
|
||||
|
||||
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">
|
||||
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
|
||||
</span>
|
||||
<span>(...)</span>
|
||||
<div class="tags">(...)</div>
|
||||
</div>
|
||||
|
||||
|
||||
With this knowledge we can refine our XPath: Instead of a path to follow,
|
||||
we'll simply select all ``span`` tags with the ``class="text"`` by using
|
||||
the `has-class-extension`_:
|
||||
|
||||
>>> response.xpath('//span[has-class("text")]/text()').getall()
|
||||
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
|
||||
'“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
|
||||
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
|
||||
...]
|
||||
|
||||
And with one simple, cleverer XPath we are able to extract all quotes from
|
||||
the page. We could have constructed a loop over our first XPath to increase
|
||||
the number of the last ``div``, but this would have been unnecessarily
|
||||
complex and by simply constructing an XPath with ``has-class("text")``
|
||||
we were able to extract all quotes in one line.
|
||||
|
||||
The `Inspector` has a lot of other helpful features, such as searching in the
|
||||
source code or directly scrolling to an element you selected. Let's demonstrate
|
||||
a use case:
|
||||
|
||||
Say you want to find the ``Next`` button on the page. Type ``Next`` into the
|
||||
search bar on the top right of the `Inspector`. You should get two results.
|
||||
The first is a ``li`` tag with the ``class="next"``, the second the text
|
||||
of an ``a`` tag. Right click on the ``a`` tag and select ``Scroll into View``.
|
||||
If you hover over the tag, you'll see the button highlighted. From here
|
||||
we could easily create a :ref:`Link Extractor <topics-link-extractors>` to
|
||||
follow the pagination. On a simple site such as this, there may not be
|
||||
the need to find an element visually but the ``Scroll into View`` function
|
||||
can be quite useful on complex sites.
|
||||
|
||||
Note that the search bar can also be used to search for and test CSS
|
||||
selectors. For example, you could search for ``span.text`` to find
|
||||
all quote texts. Instead of a full text search, this searches for
|
||||
exactly the ``span`` tag with the ``class="text"`` in the page.
|
||||
|
||||
.. _topics-network-tool:
|
||||
|
||||
The Network-tool
|
||||
================
|
||||
While scraping you may come across dynamic webpages where some parts
|
||||
of the page are loaded dynamically through multiple requests. While
|
||||
this can be quite tricky, the `Network`-tool in the Developer Tools
|
||||
greatly facilitates this task. To demonstrate the Network-tool, let's
|
||||
take a look at the page `quotes.toscrape.com/scroll`_.
|
||||
|
||||
The page is quite similar to the basic `quotes.toscrape.com`_-page,
|
||||
but instead of the above-mentioned ``Next`` button, the page
|
||||
automatically loads new quotes when you scroll to the bottom. We
|
||||
could go ahead and try out different XPaths directly, but instead
|
||||
we'll check another quite useful command from the Scrapy shell:
|
||||
|
||||
.. skip: next
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
$ scrapy shell "quotes.toscrape.com/scroll"
|
||||
(...)
|
||||
>>> view(response)
|
||||
|
||||
A browser window should open with the webpage but with one
|
||||
crucial difference: Instead of the quotes we just see a greenish
|
||||
bar with the word ``Loading...``.
|
||||
|
||||
.. image:: _images/network_01.png
|
||||
:width: 777
|
||||
:height: 296
|
||||
:alt: Response from quotes.toscrape.com/scroll
|
||||
|
||||
The ``view(response)`` command lets us view the response our
|
||||
shell or later our spider receives from the server. Here we see
|
||||
that some basic template is loaded which includes the title,
|
||||
the login-button and the footer, but the quotes are missing. This
|
||||
tells us that the quotes are being loaded from a different request
|
||||
than ``quotes.toscrape.com/scroll``.
|
||||
|
||||
If you click on the ``Network`` tab, you will probably only see
|
||||
two entries. The first thing we do is enable persistent logs by
|
||||
clicking on ``Persist Logs``. If this option is disabled, the
|
||||
log is automatically cleared each time you navigate to a different
|
||||
page. Enabling this option is a good default, since it gives us
|
||||
control over when to clear the logs.
|
||||
|
||||
If we reload the page now, you'll see the log get populated with six
|
||||
new requests.
|
||||
|
||||
.. image:: _images/network_02.png
|
||||
:width: 777
|
||||
:height: 241
|
||||
:alt: Network tab with persistent logs and requests
|
||||
|
||||
Here we see every request that has been made when reloading the page
|
||||
and can inspect each request and its response. So let's find out
|
||||
where our quotes are coming from:
|
||||
|
||||
First click on the request with the name ``scroll``. On the right
|
||||
you can now inspect the request. In ``Headers`` you'll find details
|
||||
about the request headers, such as the URL, the method, the IP-address,
|
||||
and so on. We'll ignore the other tabs and click directly on ``Response``.
|
||||
|
||||
What you should see in the ``Preview`` pane is the rendered HTML-code,
|
||||
which is exactly what we saw when we called ``view(response)`` in the
|
||||
shell. Accordingly the ``type`` of the request in the log is ``html``.
|
||||
The other requests have types like ``css`` or ``js``, but what
|
||||
interests us is the one request called ``quotes?page=1`` with the
|
||||
type ``json``.
|
||||
|
||||
If we click on this request, we see that the request URL is
|
||||
``http://quotes.toscrape.com/api/quotes?page=1`` and the response
|
||||
is a JSON-object that contains our quotes. We can also right-click
|
||||
on the request and select ``Open in new tab`` to get a better overview.
|
||||
|
||||
.. image:: _images/network_03.png
|
||||
:width: 777
|
||||
:height: 375
|
||||
:alt: JSON-object returned from the quotes.toscrape API
|
||||
|
||||
With this response we can now easily parse the JSON-object and
|
||||
also request each page to get every quote on the site::
|
||||
|
||||
import scrapy
|
||||
import json
|
||||
|
||||
|
||||
class QuoteSpider(scrapy.Spider):
|
||||
name = 'quote'
|
||||
allowed_domains = ['quotes.toscrape.com']
|
||||
page = 1
|
||||
start_urls = ['http://quotes.toscrape.com/api/quotes?page=1']
|
||||
|
||||
def parse(self, response):
|
||||
data = json.loads(response.text)
|
||||
for quote in data["quotes"]:
|
||||
yield {"quote": quote["text"]}
|
||||
if data["has_next"]:
|
||||
self.page += 1
|
||||
url = "http://quotes.toscrape.com/api/quotes?page={}".format(self.page)
|
||||
yield scrapy.Request(url=url, callback=self.parse)
|
||||
|
||||
This spider starts at the first page of the quotes-API. With each
|
||||
response, we parse the ``response.text`` and assign it to ``data``.
|
||||
This lets us operate on the JSON-object like on a Python dictionary.
|
||||
We iterate through the ``quotes`` and print out the ``quote["text"]``.
|
||||
If the handy ``has_next`` element is ``true`` (try loading
|
||||
`quotes.toscrape.com/api/quotes?page=10`_ in your browser or a
|
||||
page-number greater than 10), we increment the ``page`` attribute
|
||||
and ``yield`` a new request, inserting the incremented page-number
|
||||
into our ``url``.
|
||||
|
||||
.. _requests-from-curl:
|
||||
|
||||
On more complex websites, it can be difficult to reproduce the
|
||||
requests, as we might need to add ``headers`` or ``cookies`` to make them work.
|
||||
In those cases you can export the requests in `cURL <https://curl.haxx.se/>`_
|
||||
format, by right-clicking on each of them in the network tool and using the
|
||||
:meth:`~scrapy.http.Request.from_curl()` method to generate an equivalent
|
||||
request::
|
||||
|
||||
from scrapy import Request
|
||||
|
||||
request = Request.from_curl(
|
||||
"curl 'http://quotes.toscrape.com/api/quotes?page=1' -H 'User-Agent: Mozil"
|
||||
"la/5.0 (X11; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0' -H 'Acce"
|
||||
"pt: */*' -H 'Accept-Language: ca,en-US;q=0.7,en;q=0.3' --compressed -H 'X"
|
||||
"-Requested-With: XMLHttpRequest' -H 'Proxy-Authorization: Basic QFRLLTAzM"
|
||||
"zEwZTAxLTk5MWUtNDFiNC1iZWRmLTJjNGI4M2ZiNDBmNDpAVEstMDMzMTBlMDEtOTkxZS00MW"
|
||||
"I0LWJlZGYtMmM0YjgzZmI0MGY0' -H 'Connection: keep-alive' -H 'Referer: http"
|
||||
"://quotes.toscrape.com/scroll' -H 'Cache-Control: max-age=0'")
|
||||
|
||||
Alternatively, if you want to know the arguments needed to recreate that
|
||||
request you can use the :func:`~scrapy.utils.curl.curl_to_request_kwargs`
|
||||
function to get a dictionary with the equivalent arguments:
|
||||
|
||||
.. autofunction:: scrapy.utils.curl.curl_to_request_kwargs
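
For example, a short sketch of turning a (truncated) cURL command into a
:class:`~scrapy.http.Request` through the returned dictionary; the exact keys
in that dictionary depend on the options present in the command::

    from scrapy import Request
    from scrapy.utils.curl import curl_to_request_kwargs

    curl_command = "curl 'http://quotes.toscrape.com/api/quotes?page=1' -H 'Accept: */*'"
    request = Request(**curl_to_request_kwargs(curl_command))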
|
||||
|
||||
Note that to translate a cURL command into a Scrapy request,
|
||||
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
|
||||
|
||||
As you can see, with a few inspections in the `Network`-tool we
|
||||
were able to easily replicate the dynamic requests of the scrolling
|
||||
functionality of the page. Crawling dynamic pages can be quite
|
||||
daunting and pages can be very complex, but it (mostly) boils down
|
||||
to identifying the correct request and replicating it in your spider.
|
||||
|
||||
.. _Developer Tools: https://en.wikipedia.org/wiki/Web_development_tools
|
||||
.. _quotes.toscrape.com: http://quotes.toscrape.com
|
||||
.. _quotes.toscrape.com/scroll: http://quotes.toscrape.com/scroll
|
||||
.. _quotes.toscrape.com/api/quotes?page=10: http://quotes.toscrape.com/api/quotes?page=10
|
||||
.. _has-class-extension: https://parsel.readthedocs.io/en/latest/usage.html#other-xpath-extensions
|
||||
|
@ -1,149 +1,13 @@
|
||||
:orphan:
|
||||
|
||||
.. _topics-djangoitem:
|
||||
|
||||
==========
|
||||
DjangoItem
|
||||
==========
|
||||
|
||||
:class:`DjangoItem` is a class of item that gets its fields definition from a
|
||||
Django model, you simply create a :class:`DjangoItem` and specify what Django
|
||||
model it relates to.
|
||||
DjangoItem has been moved into a separate project.
|
||||
|
||||
Besides of getting the model fields defined on your item, :class:`DjangoItem`
|
||||
provides a method to create and populate a Django model instance with the item
|
||||
data.
|
||||
It is hosted at:
|
||||
|
||||
Using DjangoItem
|
||||
================
|
||||
|
||||
:class:`DjangoItem` works much like ModelForms in Django, you create a subclass
|
||||
and define its ``django_model`` attribute to be a valid Django model. With this
|
||||
you will get an item with a field for each Django model field.
|
||||
|
||||
In addition, you can define fields that aren't present in the model and even
|
||||
override fields that are present in the model defining them in the item.
|
||||
|
||||
Let's see some examples:
|
||||
|
||||
Creating a Django model for the examples::
|
||||
|
||||
from django.db import models
|
||||
|
||||
class Person(models.Model):
|
||||
name = models.CharField(max_length=255)
|
||||
age = models.IntegerField()
|
||||
|
||||
Defining a basic :class:`DjangoItem`::
|
||||
|
||||
from scrapy.contrib.djangoitem import DjangoItem
|
||||
|
||||
class PersonItem(DjangoItem):
|
||||
django_model = Person
|
||||
|
||||
:class:`DjangoItem` work just like :class:`~scrapy.item.Item`::
|
||||
|
||||
>>> p = PersonItem()
|
||||
>>> p['name'] = 'John'
|
||||
>>> p['age'] = '22'
|
||||
|
||||
To obtain the Django model from the item, we call the extra method
|
||||
:meth:`~DjangoItem.save` of the :class:`DjangoItem`::
|
||||
|
||||
>>> person = p.save()
|
||||
>>> person.name
|
||||
'John'
|
||||
>>> person.age
|
||||
'22'
|
||||
>>> person.id
|
||||
1
|
||||
|
||||
The model is already saved when we call :meth:`~DjangoItem.save`, we
|
||||
can prevent this by calling it with ``commit=False``. We can use
|
||||
``commit=False`` in :meth:`~DjangoItem.save` method to obtain an unsaved model::
|
||||
|
||||
>>> person = p.save(commit=False)
|
||||
>>> person.name
|
||||
'John'
|
||||
>>> person.age
|
||||
'22'
|
||||
>>> person.id
|
||||
None
|
||||
|
||||
As said before, we can add other fields to the item::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.djangoitem import DjangoItem
|
||||
|
||||
class PersonItem(DjangoItem):
|
||||
django_model = Person
|
||||
sex = scrapy.Field()
|
||||
|
||||
::
|
||||
|
||||
>>> p = PersonItem()
|
||||
>>> p['name'] = 'John'
|
||||
>>> p['age'] = '22'
|
||||
>>> p['sex'] = 'M'
|
||||
|
||||
.. note:: fields added to the item won't be taken into account when doing a :meth:`~DjangoItem.save`
|
||||
|
||||
And we can override the fields of the model with your own::
|
||||
|
||||
class PersonItem(DjangoItem):
|
||||
django_model = Person
|
||||
name = scrapy.Field(default='No Name')
|
||||
|
||||
This is useful to provide properties to the field, like a default or any other
|
||||
property that your project uses.
|
||||
|
||||
DjangoItem caveats
|
||||
==================
|
||||
|
||||
DjangoItem is a rather convenient way to integrate Scrapy projects with Django
|
||||
models, but bear in mind that Django ORM may not scale well if you scrape a lot
|
||||
of items (ie. millions) with Scrapy. This is because a relational backend is
|
||||
often not a good choice for a write intensive application (such as a web
|
||||
crawler), specially if the database is highly normalized and with many indices.
|
||||
|
||||
Django settings set up
|
||||
======================
|
||||
|
||||
To use the Django models outside the Django application you need to set up the
|
||||
``DJANGO_SETTINGS_MODULE`` environment variable and --in most cases-- modify
|
||||
the ``PYTHONPATH`` environment variable to be able to import the settings
|
||||
module.
|
||||
|
||||
There are many ways to do this depending on your use case and preferences.
|
||||
Below is detailed one of the simplest ways to do it.
|
||||
|
||||
Suppose your Django project is named ``mysite``, is located in the path
|
||||
``/home/projects/mysite`` and you have created an app ``myapp`` with the model
|
||||
``Person``. That means your directory structure is something like this::
|
||||
|
||||
/home/projects/mysite
|
||||
├── manage.py
|
||||
├── myapp
|
||||
│ ├── __init__.py
|
||||
│ ├── models.py
|
||||
│ ├── tests.py
|
||||
│ └── views.py
|
||||
└── mysite
|
||||
├── __init__.py
|
||||
├── settings.py
|
||||
├── urls.py
|
||||
└── wsgi.py
|
||||
|
||||
Then you need to add ``/home/projects/mysite`` to the ``PYTHONPATH``
|
||||
environment variable and set up the environment variable
|
||||
``DJANGO_SETTINGS_MODULE`` to ``mysite.settings``. That can be done in your
|
||||
Scrapy's settings file by adding the lines below::
|
||||
|
||||
import sys
|
||||
sys.path.append('/home/projects/mysite')
|
||||
|
||||
import os
|
||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'mysite.settings'
|
||||
|
||||
Notice that we modify the ``sys.path`` variable instead the ``PYTHONPATH``
|
||||
environment variable as we are already within the python runtime. If everything
|
||||
is right, you should be able to start the ``scrapy shell`` command and import
|
||||
the model ``Person`` (i.e. ``from myapp.models import Person``).
|
||||
https://github.com/scrapy-plugins/scrapy-djangoitem
|
||||
|
269
docs/topics/dynamic-content.rst
Normal file
@ -0,0 +1,269 @@
|
||||
.. _topics-dynamic-content:
|
||||
|
||||
====================================
|
||||
Selecting dynamically-loaded content
|
||||
====================================
|
||||
|
||||
Some webpages show the desired data when you load them in a web browser.
|
||||
However, when you download them using Scrapy, you cannot reach the desired data
|
||||
using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
When this happens, the recommended approach is to
|
||||
:ref:`find the data source <topics-finding-data-source>` and extract the data
|
||||
from it.
|
||||
|
||||
If you fail to do that, and you can nonetheless access the desired data through
|
||||
the :ref:`DOM <topics-livedom>` from your web browser, see
|
||||
:ref:`topics-javascript-rendering`.
|
||||
|
||||
.. _topics-finding-data-source:
|
||||
|
||||
Finding the data source
|
||||
=======================
|
||||
|
||||
To extract the desired data, you must first find its source location.
|
||||
|
||||
If the data is in a non-text-based format, such as an image or a PDF document,
|
||||
use the :ref:`network tool <topics-network-tool>` of your web browser to find
|
||||
the corresponding request, and :ref:`reproduce it
|
||||
<topics-reproducing-requests>`.
|
||||
|
||||
If your web browser lets you select the desired data as text, the data may be
|
||||
defined in embedded JavaScript code, or loaded from an external resource in a
|
||||
text-based format.
|
||||
|
||||
In that case, you can use a tool like wgrep_ to find the URL of that resource.
|
||||
|
||||
If the data turns out to come from the original URL itself, you must
|
||||
:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
|
||||
determine where the data is located.
|
||||
|
||||
If the data comes from a different URL, you will need to :ref:`reproduce the
|
||||
corresponding request <topics-reproducing-requests>`.
|
||||
|
||||
.. _topics-inspecting-source:
|
||||
|
||||
Inspecting the source code of a webpage
|
||||
=======================================
|
||||
|
||||
Sometimes you need to inspect the source code of a webpage (not the
|
||||
:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
|
||||
|
||||
Use Scrapy’s :command:`fetch` command to download the webpage contents as seen
|
||||
by Scrapy::
|
||||
|
||||
scrapy fetch --nolog https://example.com > response.html
|
||||
|
||||
If the desired data is in embedded JavaScript code within a ``<script/>``
|
||||
element, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
If you cannot find the desired data, first make sure it’s not just Scrapy:
|
||||
download the webpage with an HTTP client like curl_ or wget_ and see if the
|
||||
information can be found in the response they get.
|
||||
|
||||
If they get a response with the desired data, modify your Scrapy
|
||||
:class:`~scrapy.http.Request` to match that of the other HTTP client. For
|
||||
example, try using the same user-agent string (:setting:`USER_AGENT`) or the
|
||||
same :attr:`~scrapy.http.Request.headers`.
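
For example, a sketch of setting the headers on a request (the header values
and the ``parse_data`` callback are placeholders; copy the values that the
working HTTP client actually sends)::

    yield scrapy.Request(
        'https://example.com/data',
        headers={
            'User-Agent': 'Mozilla/5.0 ...',  # placeholder value
            'Accept': 'application/json',     # placeholder value
        },
        callback=self.parse_data,
    )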
|
||||
|
||||
If they also get a response without the desired data, you’ll need to take
|
||||
steps to make your request more similar to that of the web browser. See
|
||||
:ref:`topics-reproducing-requests`.
|
||||
|
||||
.. _topics-reproducing-requests:
|
||||
|
||||
Reproducing requests
|
||||
====================
|
||||
|
||||
Sometimes we need to reproduce a request the way our web browser performs it.
|
||||
|
||||
Use the :ref:`network tool <topics-network-tool>` of your web browser to see
|
||||
how your web browser performs the desired request, and try to reproduce that
|
||||
request with Scrapy.
|
||||
|
||||
It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
|
||||
method and URL. However, you may also need to reproduce the body, headers and
|
||||
form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
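
For example, a form submission observed in the browser could be reproduced
along these lines (the URL, form fields, headers and the ``parse_results``
callback are placeholders)::

    yield scrapy.FormRequest(
        'https://example.com/search',
        formdata={'query': 'foo', 'page': '1'},
        headers={'X-Requested-With': 'XMLHttpRequest'},
        callback=self.parse_results,
    )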
|
||||
|
||||
As all major browsers allow you to export the requests in `cURL
|
||||
<https://curl.haxx.se/>`_ format, Scrapy incorporates the method
|
||||
:meth:`~scrapy.http.Request.from_curl()` to generate an equivalent
|
||||
:class:`~scrapy.http.Request` from a cURL command. To get more information
|
||||
visit :ref:`request from curl <requests-from-curl>` inside the network
|
||||
tool section.
|
||||
|
||||
Once you get the expected response, you can :ref:`extract the desired data from
|
||||
it <topics-handling-response-formats>`.
|
||||
|
||||
You can reproduce any request with Scrapy. However, sometimes reproducing all
|
||||
necessary requests may not seem efficient in developer time. If that is your
|
||||
case, and crawling speed is not a major concern for you, you can alternatively
|
||||
consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
|
||||
|
||||
If you get the expected response `sometimes`, but not always, the issue is
|
||||
probably not your request, but the target server. The target server might be
|
||||
buggy, overloaded, or :ref:`banning <bans>` some of your requests.
|
||||
|
||||
Note that to translate a cURL command into a Scrapy request,
|
||||
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
|
||||
|
||||
.. _topics-handling-response-formats:
|
||||
|
||||
Handling different response formats
|
||||
===================================
|
||||
|
||||
Once you have a response with the desired data, how you extract the desired
|
||||
data from it depends on the type of response:
|
||||
|
||||
- If the response is HTML or XML, use :ref:`selectors
|
||||
<topics-selectors>` as usual.
|
||||
|
||||
- If the response is JSON, use :func:`json.loads` to load the desired data from
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`::
|
||||
|
||||
data = json.loads(response.text)
|
||||
|
||||
If the desired data is inside HTML or XML code embedded within JSON data,
|
||||
you can load that HTML or XML code into a
|
||||
:class:`~scrapy.selector.Selector` and then
|
||||
:ref:`use it <topics-selectors>` as usual::
|
||||
|
||||
selector = Selector(text=data['html'])
|
||||
|
||||
- If the response is JavaScript, or HTML with a ``<script/>`` element
|
||||
containing the desired data, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
- If the response is CSS, use a :doc:`regular expression <library/re>` to
|
||||
extract the desired data from
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
.. _topics-parsing-images:
|
||||
|
||||
- If the response is an image or another format based on images (e.g. PDF),
|
||||
read the response as bytes from
|
||||
:attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
|
||||
solution to extract the desired data as text.
|
||||
|
||||
For example, you can use pytesseract_ (see the sketch after this list). To read a table from a PDF,
|
||||
`tabula-py`_ may be a better choice.
|
||||
|
||||
- If the response is SVG, or HTML with embedded SVG containing the desired
|
||||
data, you may be able to extract the desired data using
|
||||
:ref:`selectors <topics-selectors>`, since SVG is based on XML.
|
||||
|
||||
Otherwise, you might need to convert the SVG code into a raster image, and
|
||||
:ref:`handle that raster image <topics-parsing-images>`.
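
Following up on the OCR note in the list above, here is a minimal sketch of
running pytesseract_ on an image response, assuming that both Pillow and
pytesseract are installed and that the image contains machine-readable text::

    import io

    import pytesseract
    from PIL import Image

    def parse_image(self, response):
        image = Image.open(io.BytesIO(response.body))  # load the raw bytes as an image
        text = pytesseract.image_to_string(image)      # run OCR on the image
        yield {'url': response.url, 'ocr_text': text}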
|
||||
|
||||
.. _topics-parsing-javascript:
|
||||
|
||||
Parsing JavaScript code
|
||||
=======================
|
||||
|
||||
If the desired data is hardcoded in JavaScript, you first need to get the
|
||||
JavaScript code:
|
||||
|
||||
- If the JavaScript code is in a JavaScript file, simply read
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
- If the JavaScript code is within a ``<script/>`` element of an HTML page,
|
||||
use :ref:`selectors <topics-selectors>` to extract the text within that
|
||||
``<script/>`` element.
|
||||
|
||||
Once you have a string with the JavaScript code, you can extract the desired
|
||||
data from it:
|
||||
|
||||
- You might be able to use a :doc:`regular expression <library/re>` to
|
||||
extract the desired data in JSON format, which you can then parse with
|
||||
:func:`json.loads`.
|
||||
|
||||
For example, if the JavaScript code contains a separate line like
|
||||
``var data = {"field": "value"};`` you can extract that data as follows:
|
||||
|
||||
>>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
|
||||
>>> json_data = response.css('script::text').re_first(pattern)
|
||||
>>> json.loads(json_data)
|
||||
{'field': 'value'}
|
||||
|
||||
- chompjs_ provides an API to parse JavaScript objects into a :class:`dict`.
|
||||
|
||||
For example, if the JavaScript code contains
|
||||
``var data = {field: "value", secondField: "second value"};``
|
||||
you can extract that data as follows:
|
||||
|
||||
>>> import chompjs
|
||||
>>> javascript = response.css('script::text').get()
|
||||
>>> data = chompjs.parse_js_object(javascript)
|
||||
>>> data
|
||||
{'field': 'value', 'secondField': 'second value'}
|
||||
|
||||
- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
|
||||
that you can parse using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
For example, if the JavaScript code contains
|
||||
``var data = {field: "value"};`` you can extract that data as follows:
|
||||
|
||||
>>> import js2xml
|
||||
>>> import lxml.etree
|
||||
>>> from parsel import Selector
|
||||
>>> javascript = response.css('script::text').get()
|
||||
>>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
|
||||
>>> selector = Selector(text=xml)
|
||||
>>> selector.css('var[name="data"]').get()
|
||||
'<var name="data"><object><property name="field"><string>value</string></property></object></var>'
|
||||
|
||||
.. _topics-javascript-rendering:
|
||||
|
||||
Pre-rendering JavaScript
|
||||
========================
|
||||
|
||||
On webpages that fetch data from additional requests, reproducing those
|
||||
requests that contain the desired data is the preferred approach. The effort is
|
||||
often worth the result: structured, complete data with minimum parsing time and
|
||||
network transfer.
|
||||
|
||||
However, sometimes it can be really hard to reproduce certain requests. Or you
|
||||
may need something that no request can give you, such as a screenshot of a
|
||||
webpage as seen in a web browser.
|
||||
|
||||
In these cases use the Splash_ JavaScript-rendering service, along with
|
||||
`scrapy-splash`_ for seamless integration.
|
||||
|
||||
Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
|
||||
you can parse it with :ref:`selectors <topics-selectors>`. It provides great
|
||||
flexibility through configuration_ or scripting_.
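
As a rough sketch of what this looks like from a spider, assuming scrapy-splash
is installed and configured as described in its README (``SplashRequest`` and
its ``args`` belong to that plugin, not to Scrapy itself, and the URL is a
placeholder)::

    from scrapy_splash import SplashRequest

    def start_requests(self):
        yield SplashRequest(
            'https://example.com',
            callback=self.parse,
            args={'wait': 0.5},  # give the page's JavaScript time to run
        )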
|
||||
|
||||
If you need something beyond what Splash offers, such as interacting with the
|
||||
DOM on-the-fly from Python code instead of using a previously-written script,
|
||||
or handling multiple web browser windows, you might need to
|
||||
:ref:`use a headless browser <topics-headless-browsing>` instead.
|
||||
|
||||
.. _configuration: https://splash.readthedocs.io/en/stable/api.html
|
||||
.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
|
||||
|
||||
.. _topics-headless-browsing:
|
||||
|
||||
Using a headless browser
|
||||
========================
|
||||
|
||||
A `headless browser`_ is a special web browser that provides an API for
|
||||
automation.
|
||||
|
||||
The easiest way to use a headless browser with Scrapy is to use Selenium_,
|
||||
along with `scrapy-selenium`_ for seamless integration.
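
As with Splash, the integration boils down to using the request class provided
by the plugin. A rough sketch, assuming scrapy-selenium is installed and
configured according to its README (``SeleniumRequest`` belongs to that plugin
and the URL is a placeholder)::

    from scrapy_selenium import SeleniumRequest

    def start_requests(self):
        yield SeleniumRequest(url='https://example.com', callback=self.parse)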
|
||||
|
||||
|
||||
.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
|
||||
.. _chompjs: https://github.com/Nykakin/chompjs
|
||||
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
|
||||
.. _curl: https://curl.haxx.se/
|
||||
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
|
||||
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
|
||||
.. _js2xml: https://github.com/scrapinghub/js2xml
|
||||
.. _pytesseract: https://github.com/madmaze/pytesseract
|
||||
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
|
||||
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
|
||||
.. _Selenium: https://www.selenium.dev/
|
||||
.. _Splash: https://github.com/scrapinghub/splash
|
||||
.. _tabula-py: https://github.com/chezou/tabula-py
|
||||
.. _wget: https://www.gnu.org/software/wget/
|
||||
.. _wgrep: https://github.com/stav/wgrep
|
@ -7,21 +7,19 @@ Sending e-mail
|
||||
.. module:: scrapy.mail
|
||||
:synopsis: Email sending facility
|
||||
|
||||
Although Python makes sending e-mails relatively easy via the `smtplib`_
|
||||
Although Python makes sending e-mails relatively easy via the :mod:`smtplib`
|
||||
library, Scrapy provides its own facility for sending e-mails which is very
|
||||
easy to use and it's implemented using `Twisted non-blocking IO`_, to avoid
|
||||
interfering with the non-blocking IO of the crawler. It also provides a
|
||||
simple API for sending attachments and it's very easy to configure, with a few
|
||||
:ref:`settings <topics-email-settings>`.
|
||||
|
||||
.. _smtplib: http://docs.python.org/library/smtplib.html
|
||||
.. _Twisted non-blocking IO: http://twistedmatrix.com/documents/current/core/howto/defer-intro.html
|
||||
easy to use and it's implemented using :doc:`Twisted non-blocking IO
|
||||
<twisted:core/howto/defer-intro>`, to avoid interfering with the non-blocking
|
||||
IO of the crawler. It also provides a simple API for sending attachments and
|
||||
it's very easy to configure, with a few :ref:`settings
|
||||
<topics-email-settings>`.
|
||||
|
||||
Quick example
|
||||
=============
|
||||
|
||||
There are two ways to instantiate the mail sender. You can instantiate it using
|
||||
the standard constructor::
|
||||
the standard ``__init__`` method::
|
||||
|
||||
from scrapy.mail import MailSender
|
||||
mailer = MailSender()
|
||||
@ -39,7 +37,8 @@ MailSender class reference
|
||||
==========================
|
||||
|
||||
MailSender is the preferred class to use for sending emails from Scrapy, as it
|
||||
uses `Twisted non-blocking IO`_, like the rest of the framework.
|
||||
uses :doc:`Twisted non-blocking IO <twisted:core/howto/defer-intro>`, like the
|
||||
rest of the framework.
|
||||
|
||||
.. class:: MailSender(smtphost=None, mailfrom=None, smtpuser=None, smtppass=None, smtpport=None)
|
||||
|
||||
@ -54,19 +53,19 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
|
||||
:param smtpuser: the SMTP user. If omitted, the :setting:`MAIL_USER`
|
||||
setting will be used. If not given, no SMTP authentication will be
|
||||
performed.
|
||||
:type smtphost: str
|
||||
:type smtphost: str or bytes
|
||||
|
||||
:param smtppass: the SMTP pass for authentication.
|
||||
:type smtppass: str
|
||||
:type smtppass: str or bytes
|
||||
|
||||
:param smtpport: the SMTP port to connect to
|
||||
:type smtpport: int
|
||||
|
||||
:param smtptls: enforce using SMTP STARTTLS
|
||||
:type smtpport: boolean
|
||||
:type smtptls: bool
|
||||
|
||||
:param smtpssl: enforce using a secure SSL connection
|
||||
:type smtpport: boolean
|
||||
:type smtpssl: bool
|
||||
|
||||
.. classmethod:: from_settings(settings)
|
||||
|
||||
@ -76,18 +75,18 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
|
||||
:param settings: the e-mail recipients
|
||||
:type settings: :class:`scrapy.settings.Settings` object
|
||||
|
||||
.. method:: send(to, subject, body, cc=None, attachs=(), mimetype='text/plain')
|
||||
.. method:: send(to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None)
|
||||
|
||||
Send email to the given recipients.
|
||||
|
||||
:param to: the e-mail recipients
|
||||
:type to: list
|
||||
:param to: the e-mail recipients as a string or as a list of strings
|
||||
:type to: str or list
|
||||
|
||||
:param subject: the subject of the e-mail
|
||||
:type subject: str
|
||||
|
||||
:param cc: the e-mails to CC
|
||||
:type cc: list
|
||||
:param cc: the e-mails to CC as a string or as a list of strings
|
||||
:type cc: str or list
|
||||
|
||||
:param body: the e-mail body
|
||||
:type body: str
|
||||
@ -97,18 +96,21 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
|
||||
appear on the e-mail's attachment, ``mimetype`` is the mimetype of the
|
||||
attachment and ``file_object`` is a readable file object with the
|
||||
contents of the attachment
|
||||
:type attachs: iterable
|
||||
:type attachs: collections.abc.Iterable
|
||||
|
||||
:param mimetype: the MIME type of the e-mail
|
||||
:type mimetype: str
|
||||
|
||||
:param charset: the character encoding to use for the e-mail contents
|
||||
:type charset: str
|
||||
|
||||
|
||||
.. _topics-email-settings:
|
||||
|
||||
Mail settings
|
||||
=============
|
||||
|
||||
These settings define the default constructor values of the :class:`MailSender`
|
||||
These settings define the default ``__init__`` method values of the :class:`MailSender`
|
||||
class, and can be used to configure e-mail notifications in your project without
|
||||
writing any code (for those extensions and code that uses :class:`MailSender`).
|
||||
|
||||
@ -161,7 +163,7 @@ Password to use for SMTP authentication, along with :setting:`MAIL_USER`.
|
||||
.. setting:: MAIL_TLS
|
||||
|
||||
MAIL_TLS
|
||||
---------
|
||||
--------
|
||||
|
||||
Default: ``False``
|
||||
|
||||
@ -170,7 +172,7 @@ Enforce using STARTTLS. STARTTLS is a way to take an existing insecure connectio
|
||||
.. setting:: MAIL_SSL
|
||||
|
||||
MAIL_SSL
|
||||
---------
|
||||
--------
|
||||
|
||||
Default: ``False``
|
||||
|
||||
|
@ -14,13 +14,6 @@ Built-in Exceptions reference
|
||||
|
||||
Here's a list of all exceptions included in Scrapy and their usage.
|
||||
|
||||
DropItem
|
||||
--------
|
||||
|
||||
.. exception:: DropItem
|
||||
|
||||
The exception that must be raised by item pipeline stages to stop processing an
|
||||
Item. For more information see :ref:`topics-item-pipeline`.
|
||||
|
||||
CloseSpider
|
||||
-----------
|
||||
@ -39,6 +32,22 @@ For example::
|
||||
if 'Bandwidth exceeded' in response.text:
|
||||
raise CloseSpider('bandwidth_exceeded')
|
||||
|
||||
DontCloseSpider
|
||||
---------------
|
||||
|
||||
.. exception:: DontCloseSpider
|
||||
|
||||
This exception can be raised in a :signal:`spider_idle` signal handler to
|
||||
prevent the spider from being closed.
|
||||
|
||||
DropItem
|
||||
--------
|
||||
|
||||
.. exception:: DropItem
|
||||
|
||||
The exception that must be raised by item pipeline stages to stop processing an
|
||||
Item. For more information see :ref:`topics-item-pipeline`.
|
||||
|
||||
IgnoreRequest
|
||||
-------------
|
||||
|
||||
@ -57,10 +66,10 @@ remain disabled. Those components include:
|
||||
|
||||
* Extensions
|
||||
* Item pipelines
|
||||
* Downloader middlwares
|
||||
* Downloader middlewares
|
||||
* Spider middlewares
|
||||
|
||||
The exception must be raised in the component constructor.
|
||||
The exception must be raised in the component's ``__init__`` method.
|
||||
|
||||
NotSupported
|
||||
------------
|
||||
@ -69,3 +78,37 @@ NotSupported
|
||||
|
||||
This exception is raised to indicate an unsupported feature.
|
||||
|
||||
StopDownload
|
||||
-------------
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
.. exception:: StopDownload(fail=True)
|
||||
|
||||
Raised from a :class:`~scrapy.signals.bytes_received` signal handler to
|
||||
indicate that no further bytes should be downloaded for a response.
|
||||
|
||||
The ``fail`` boolean parameter controls which method will handle the resulting
|
||||
response:
|
||||
|
||||
* If ``fail=True`` (default), the request errback is called. The response object is
|
||||
available as the ``response`` attribute of the ``StopDownload`` exception,
|
||||
which is in turn stored as the ``value`` attribute of the received
|
||||
:class:`~twisted.python.failure.Failure` object. This means that in an errback
|
||||
defined as ``def errback(self, failure)``, the response can be accessed through
|
||||
``failure.value.response``.
|
||||
|
||||
* If ``fail=False``, the request callback is called instead.
|
||||
|
||||
In both cases, the response could have its body truncated: the body contains
|
||||
all bytes received up until the exception is raised, including the bytes
|
||||
received in the signal handler that raises the exception. Also, the response
|
||||
object is marked with ``"download_stopped"`` in its :attr:`Response.flags`
|
||||
attribute.
|
||||
|
||||
.. note:: ``fail`` is a keyword-only parameter, i.e. raising
|
||||
``StopDownload(False)`` or ``StopDownload(True)`` will raise
|
||||
a :class:`TypeError`.
|
||||
|
||||
See the documentation for the :class:`~scrapy.signals.bytes_received` signal
|
||||
and the :ref:`topics-stop-response-download` topic for additional information and examples.
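
As a minimal sketch of the ``fail=False`` case (the spider name and URL are
placeholders), a spider can connect a handler to :signal:`bytes_received` and
stop the download after the first received chunk, so that ``parse`` gets the
truncated response::

    import scrapy
    from scrapy.exceptions import StopDownload

    class StopSpider(scrapy.Spider):
        name = 'stop'
        start_urls = ['https://example.com']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # stop after the first chunk; parse() receives the truncated response
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info('Received %d bytes', len(response.body))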
|
||||
|
@ -4,10 +4,10 @@
|
||||
Item Exporters
|
||||
==============
|
||||
|
||||
.. module:: scrapy.contrib.exporter
|
||||
.. module:: scrapy.exporters
|
||||
:synopsis: Item Exporters
|
||||
|
||||
Once you have scraped your Items, you often want to persist or export those
|
||||
Once you have scraped your items, you often want to persist or export those
|
||||
items, to use the data in some other application. That is, after all, the whole
|
||||
purpose of the scraping process.
|
||||
|
||||
@ -36,38 +36,37 @@ to export
|
||||
3. and finally call the :meth:`~BaseItemExporter.finish_exporting` to signal
|
||||
the end of the exporting process
|
||||
|
||||
Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses an Item
|
||||
Exporter to export scraped items to different files, one per spider::
|
||||
Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses multiple
|
||||
Item Exporters to group scraped items to different files according to the
|
||||
value of one of their fields::
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.contrib.exporter import XmlItemExporter
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exporters import XmlItemExporter
|
||||
|
||||
class XmlExportPipeline(object):
|
||||
class PerYearXmlExportPipeline:
|
||||
"""Distribute items across multiple XML files according to their 'year' field"""
|
||||
|
||||
def __init__(self):
|
||||
self.files = {}
|
||||
def open_spider(self, spider):
|
||||
self.year_to_exporter = {}
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
pipeline = cls()
|
||||
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
|
||||
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
|
||||
return pipeline
|
||||
def close_spider(self, spider):
|
||||
for exporter in self.year_to_exporter.values():
|
||||
exporter.finish_exporting()
|
||||
|
||||
def spider_opened(self, spider):
|
||||
file = open('%s_products.xml' % spider.name, 'w+b')
|
||||
self.files[spider] = file
|
||||
self.exporter = XmlItemExporter(file)
|
||||
self.exporter.start_exporting()
|
||||
def _exporter_for_item(self, item):
|
||||
adapter = ItemAdapter(item)
|
||||
year = adapter['year']
|
||||
if year not in self.year_to_exporter:
|
||||
f = open('{}.xml'.format(year), 'wb')
|
||||
exporter = XmlItemExporter(f)
|
||||
exporter.start_exporting()
|
||||
self.year_to_exporter[year] = exporter
|
||||
return self.year_to_exporter[year]
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.exporter.finish_exporting()
|
||||
file = self.files.pop(spider)
|
||||
file.close()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
self.exporter.export_item(item)
|
||||
return item
|
||||
def process_item(self, item, spider):
|
||||
exporter = self._exporter_for_item(item)
|
||||
exporter.export_item(item)
|
||||
return item
|
||||
|
||||
|
||||
.. _topics-exporters-field-serialization:
|
||||
@ -90,9 +89,9 @@ described next.
|
||||
1. Declaring a serializer in the field
|
||||
--------------------------------------
|
||||
|
||||
You can declare a serializer in the :ref:`field metadata
|
||||
<topics-items-fields>`. The serializer must be a callable which receives a
|
||||
value and returns its serialized form.
|
||||
If you use :class:`~.Item` you can declare a serializer in the
|
||||
:ref:`field metadata <topics-items-fields>`. The serializer must be
|
||||
a callable which receives a value and returns its serialized form.
|
||||
|
||||
Example::
|
||||
|
||||
@ -117,7 +116,7 @@ after your custom code.
|
||||
|
||||
Example::
|
||||
|
||||
from scrapy.contrib.exporter import XmlItemExporter
|
||||
from scrapy.exporter import XmlItemExporter
|
||||
|
||||
class ProductXmlExporter(XmlItemExporter):
|
||||
|
||||
@ -140,16 +139,19 @@ output examples, which assume you're exporting these two items::
|
||||
BaseItemExporter
|
||||
----------------
|
||||
|
||||
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8')
|
||||
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0, dont_fail=False)
|
||||
|
||||
This is the (abstract) base class for all Item Exporters. It provides
|
||||
support for common features used by all (concrete) Item Exporters, such as
|
||||
defining what fields to export, whether to export empty fields, or which
|
||||
encoding to use.
|
||||
|
||||
These features can be configured through the constructor arguments which
|
||||
These features can be configured through the ``__init__`` method arguments which
|
||||
populate their respective instance attributes: :attr:`fields_to_export`,
|
||||
:attr:`export_empty_fields`, :attr:`encoding`.
|
||||
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
The *dont_fail* parameter.
|
||||
|
||||
.. method:: export_item(item)
|
||||
|
||||
@ -164,11 +166,12 @@ BaseItemExporter
|
||||
By default, this method looks for a serializer :ref:`declared in the item
|
||||
field <topics-exporters-serializers>` and returns the result of applying
|
||||
that serializer to the value. If no serializer is found, it returns the
|
||||
value unchanged except for ``unicode`` values which are encoded to
|
||||
``str`` using the encoding declared in the :attr:`encoding` attribute.
|
||||
value unchanged.
|
||||
|
||||
:param field: the field being serialized
|
||||
:type field: :class:`~scrapy.item.Field` object
|
||||
:param field: the field being serialized. If the source :ref:`item object
|
||||
<item-types>` does not define field metadata, *field* is an empty
|
||||
:class:`dict`.
|
||||
:type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance
|
||||
|
||||
:param name: the name of the field being serialized
|
||||
:type name: str
|
||||
@ -191,35 +194,56 @@ BaseItemExporter
|
||||
|
||||
.. attribute:: fields_to_export
|
||||
|
||||
A list with the name of the fields that will be exported, or None if you
|
||||
want to export all fields. Defaults to None.
|
||||
A list with the name of the fields that will be exported, or ``None`` if
|
||||
you want to export all fields. Defaults to ``None``.
|
||||
|
||||
Some exporters (like :class:`CsvItemExporter`) respect the order of the
|
||||
fields defined in this attribute.
|
||||
|
||||
When using :ref:`item objects <item-types>` that do not expose all their
|
||||
possible fields, exporters that do not support exporting a different
|
||||
subset of fields per item will only export the fields found in the first
|
||||
item exported. Use ``fields_to_export`` to define all the fields to be
|
||||
exported.
|
||||
|
||||
.. attribute:: export_empty_fields
|
||||
|
||||
Whether to include empty/unpopulated item fields in the exported data.
|
||||
Defaults to ``False``. Some exporters (like :class:`CsvItemExporter`)
|
||||
ignore this attribute and always export all empty fields.
|
||||
|
||||
This option is ignored for dict items.
|
||||
|
||||
.. attribute:: encoding
|
||||
|
||||
The encoding that will be used to encode unicode values. This only
|
||||
affects unicode values (which are always serialized to str using this
|
||||
encoding). Other value types are passed unchanged to the specific
|
||||
serialization library.
|
||||
The output character encoding.
|
||||
|
||||
.. attribute:: indent
|
||||
|
||||
Amount of spaces used to indent the output on each level. Defaults to ``0``.
|
||||
|
||||
* ``indent=None`` selects the most compact representation,
|
||||
all items in the same line with no indentation
|
||||
* ``indent<=0`` each item on its own line, no indentation
|
||||
* ``indent>0`` each item on its own line, indented with the provided numeric value
|
||||
|
||||
PythonItemExporter
|
||||
------------------
|
||||
|
||||
.. autoclass:: PythonItemExporter
|
||||
|
||||
|
||||
.. highlight:: none
|
||||
|
||||
XmlItemExporter
|
||||
---------------
|
||||
|
||||
.. class:: XmlItemExporter(file, item_element='item', root_element='items', \**kwargs)
|
||||
.. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs)
|
||||
|
||||
Exports Items in XML format to the specified file object.
|
||||
Exports items in XML format to the specified file object.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
:param root_element: The name of root element in the exported XML.
|
||||
:type root_element: str
|
||||
@ -227,8 +251,8 @@ XmlItemExporter
|
||||
:param item_element: The name of each item element in the exported XML.
|
||||
:type item_element: str
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -268,27 +292,28 @@ XmlItemExporter
|
||||
CsvItemExporter
|
||||
---------------
|
||||
|
||||
.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', \**kwargs)
|
||||
.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs)
|
||||
|
||||
Exports Items in CSV format to the given file-like object. If the
|
||||
Exports items in CSV format to the given file-like object. If the
|
||||
:attr:`fields_to_export` attribute is set, it will be used to define the
|
||||
CSV columns and their order. The :attr:`export_empty_fields` attribute has
|
||||
no effect on this exporter.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, an ``io.BytesIO`` object, etc.)
|
||||
|
||||
:param include_headers_line: If enabled, makes the exporter output a header
|
||||
line with the field names taken from
|
||||
:attr:`BaseItemExporter.fields_to_export` or the first exported item fields.
|
||||
:type include_headers_line: boolean
|
||||
:type include_headers_line: bool
|
||||
|
||||
:param join_multivalued: The char (or chars) that will be used for joining
|
||||
multi-valued fields, if found.
|
||||
:type join_multivalued: str
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor, and the leftover arguments to the
|
||||
`csv.writer`_ constructor, so you can use any `csv.writer` constructor
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
|
||||
:func:`csv.writer` function, so you can use any :func:`csv.writer` function
|
||||
argument to customize this exporter.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
@ -297,40 +322,38 @@ CsvItemExporter
|
||||
Color TV,1200
|
||||
DVD player,200
|
||||
|
||||
.. _csv.writer: http://docs.python.org/library/csv.html#csv.writer
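For example, a rough usage sketch (assuming the ``scrapy.exporters`` import
path used elsewhere in this document) that forwards the ``delimiter``
argument to ``csv.writer`` through the extra keyword arguments::

    from scrapy.exporters import CsvItemExporter

    with open('products.csv', 'wb') as f:          # binary mode: write() must accept bytes
        exporter = CsvItemExporter(f, delimiter=';')
        exporter.start_exporting()
        exporter.export_item({'product': 'Color TV', 'price': '1200'})
        exporter.export_item({'product': 'DVD player', 'price': '200'})
        exporter.finish_exporting()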
|
||||
|
||||
PickleItemExporter
|
||||
------------------
|
||||
|
||||
.. class:: PickleItemExporter(file, protocol=0, \**kwargs)
|
||||
.. class:: PickleItemExporter(file, protocol=0, **kwargs)
|
||||
|
||||
Exports Items in pickle format to the given file-like object.
|
||||
Exports items in pickle format to the given file-like object.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
:param protocol: The pickle protocol to use.
|
||||
:type protocol: int
|
||||
|
||||
For more information, refer to the `pickle module documentation`_.
|
||||
For more information, see :mod:`pickle`.
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
Pickle isn't a human readable format, so no output examples are provided.
|
||||
|
||||
.. _pickle module documentation: http://docs.python.org/library/pickle.html
|
||||
|
||||
PprintItemExporter
|
||||
------------------
|
||||
|
||||
.. class:: PprintItemExporter(file, \**kwargs)
|
||||
.. class:: PprintItemExporter(file, **kwargs)
|
||||
|
||||
Exports Items in pretty print format to the specified file object.
|
||||
Exports items in pretty print format to the specified file object.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -342,15 +365,16 @@ PprintItemExporter
|
||||
JsonItemExporter
|
||||
----------------
|
||||
|
||||
.. class:: JsonItemExporter(file, \**kwargs)
|
||||
.. class:: JsonItemExporter(file, **kwargs)
|
||||
|
||||
Exports Items in JSON format to the specified file-like object, writing all
|
||||
objects as a list of objects. The additional constructor arguments are
|
||||
passed to the :class:`BaseItemExporter` constructor, and the leftover
|
||||
arguments to the `JSONEncoder`_ constructor, so you can use any
|
||||
`JSONEncoder`_ constructor argument to customize this exporter.
|
||||
Exports items in JSON format to the specified file-like object, writing all
|
||||
objects as a list of objects. The additional ``__init__`` method arguments are
|
||||
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
|
||||
arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
|
||||
:class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -367,20 +391,19 @@ JsonItemExporter
|
||||
stream-friendly format, consider using :class:`JsonLinesItemExporter`
|
||||
instead, or splitting the output in multiple chunks.
|
||||
|
||||
.. _JSONEncoder: http://docs.python.org/library/json.html#json.JSONEncoder
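For example, a rough usage sketch (assuming the ``scrapy.exporters`` import
path) that forwards ``sort_keys`` to :class:`~json.JSONEncoder` and sets an
indent::

    from scrapy.exporters import JsonItemExporter

    with open('items.json', 'wb') as f:            # binary mode: write() must accept bytes
        exporter = JsonItemExporter(f, indent=4, sort_keys=True)
        exporter.start_exporting()
        exporter.export_item({'name': 'Color TV', 'price': '1200'})
        exporter.finish_exporting()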
|
||||
|
||||
JsonLinesItemExporter
|
||||
---------------------
|
||||
|
||||
.. class:: JsonLinesItemExporter(file, \**kwargs)
|
||||
.. class:: JsonLinesItemExporter(file, **kwargs)
|
||||
|
||||
Exports Items in JSON format to the specified file-like object, writing one
|
||||
JSON-encoded item per line. The additional constructor arguments are passed
|
||||
to the :class:`BaseItemExporter` constructor, and the leftover arguments to
|
||||
the `JSONEncoder`_ constructor, so you can use any `JSONEncoder`_
|
||||
constructor argument to customize this exporter.
|
||||
Exports items in JSON format to the specified file-like object, writing one
|
||||
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
|
||||
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
|
||||
the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
|
||||
:class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.
|
||||
|
||||
:param file: the file-like object to use for exporting the data.
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -390,4 +413,7 @@ JsonLinesItemExporter
|
||||
Unlike the one produced by :class:`JsonItemExporter`, the format produced by
|
||||
this exporter is well suited for serializing large amounts of data.
|
||||
|
||||
.. _JSONEncoder: http://docs.python.org/library/json.html#json.JSONEncoder
|
||||
MarshalItemExporter
|
||||
-------------------
|
||||
|
||||
.. autoclass:: MarshalItemExporter
|
||||
|
@ -5,7 +5,7 @@ Extensions
|
||||
==========
|
||||
|
||||
The extensions framework provides a mechanism for inserting your own
|
||||
custom functionality into Scrapy.
|
||||
custom functionality into Scrapy.
|
||||
|
||||
Extensions are just regular classes that are instantiated at Scrapy startup,
|
||||
when extensions are initialized.
|
||||
@ -17,42 +17,41 @@ Extensions use the :ref:`Scrapy settings <topics-settings>` to manage their
|
||||
settings, just like any other Scrapy code.
|
||||
|
||||
It is customary for extensions to prefix their settings with their own name, to
|
||||
avoid collision with existing (and future) extensions. For example, an
|
||||
avoid collision with existing (and future) extensions. For example, a
|
||||
hypothetical extension to handle `Google Sitemaps`_ would use settings like
|
||||
`GOOGLESITEMAP_ENABLED`, `GOOGLESITEMAP_DEPTH`, and so on.
|
||||
``GOOGLESITEMAP_ENABLED``, ``GOOGLESITEMAP_DEPTH``, and so on.
|
||||
|
||||
.. _Google Sitemaps: http://en.wikipedia.org/wiki/Sitemaps
|
||||
.. _Google Sitemaps: https://en.wikipedia.org/wiki/Sitemaps
|
||||
|
||||
Loading & activating extensions
|
||||
===============================
|
||||
|
||||
Extensions are loaded and activated at startup by instantiating a single
|
||||
instance of the extension class. Therefore, all the extension initialization
|
||||
code must be performed in the class constructor (``__init__`` method).
|
||||
code must be performed in the class ``__init__`` method.
|
||||
|
||||
To make an extension available, add it to the :setting:`EXTENSIONS` setting in
|
||||
your Scrapy settings. In :setting:`EXTENSIONS`, each extension is represented
|
||||
by a string: the full Python path to the extension's class name. For example::
|
||||
|
||||
EXTENSIONS = {
|
||||
'scrapy.contrib.corestats.CoreStats': 500,
|
||||
'scrapy.telnet.TelnetConsole': 500,
|
||||
'scrapy.extensions.corestats.CoreStats': 500,
|
||||
'scrapy.extensions.telnet.TelnetConsole': 500,
|
||||
}
|
||||
|
||||
|
||||
As you can see, the :setting:`EXTENSIONS` setting is a dict where the keys are
|
||||
the extension paths, and their values are the orders, which define the
|
||||
extension *loading* order. Extensions orders are not as important as middleware
|
||||
orders though, and they are typically irrelevant, ie. it doesn't matter in
|
||||
which order the extensions are loaded because they don't depend on each other
|
||||
[1].
|
||||
extension *loading* order. The :setting:`EXTENSIONS` setting is merged with the
|
||||
:setting:`EXTENSIONS_BASE` setting defined in Scrapy (and not meant to be
|
||||
overridden) and then sorted by order to get the final sorted list of enabled
|
||||
extensions.
|
||||
|
||||
However, this feature can be exploited if you need to add an extension which
|
||||
depends on other extensions already loaded.
|
||||
|
||||
[1] This is is why the :setting:`EXTENSIONS_BASE` setting in Scrapy (which
|
||||
contains all built-in extensions enabled by default) defines all the extensions
|
||||
with the same order (``500``).
|
||||
As extensions typically do not depend on each other, their loading order is
|
||||
irrelevant in most cases. This is why the :setting:`EXTENSIONS_BASE` setting
|
||||
defines all extensions with the same order (``0``). However, this feature can
|
||||
be exploited if you need to add an extension which depends on other extensions
|
||||
already loaded.
|
||||
|
||||
Available, enabled and disabled extensions
|
||||
==========================================
|
||||
@ -64,25 +63,21 @@ but disabled unless the :setting:`HTTPCACHE_ENABLED` setting is set.
|
||||
Disabling an extension
|
||||
======================
|
||||
|
||||
In order to disable an extension that comes enabled by default (ie. those
|
||||
In order to disable an extension that comes enabled by default (i.e. those
|
||||
included in the :setting:`EXTENSIONS_BASE` setting) you must set its order to
|
||||
``None``. For example::
|
||||
|
||||
EXTENSIONS = {
|
||||
'scrapy.contrib.corestats.CoreStats': None,
|
||||
'scrapy.extensions.corestats.CoreStats': None,
|
||||
}
|
||||
|
||||
Writing your own extension
|
||||
==========================
|
||||
|
||||
Writing your own extension is easy. Each extension is a single Python class
|
||||
which doesn't need to implement any particular method.
|
||||
|
||||
The main entry point for a Scrapy extension (this also includes middlewares and
|
||||
pipelines) is the ``from_crawler`` class method which receives a
|
||||
``Crawler`` instance which is the main object controlling the Scrapy crawler.
|
||||
Through that object you can access settings, signals, stats, and also control
|
||||
the crawler behaviour, if your extension needs to such thing.
|
||||
Each extension is a Python class. The main entry point for a Scrapy extension
|
||||
(this also includes middlewares and pipelines) is the ``from_crawler``
|
||||
class method which receives a ``Crawler`` instance. Through the Crawler object
|
||||
you can access settings, signals, stats, and also control the crawling behaviour.
|
||||
|
||||
Typically, extensions connect to :ref:`signals <topics-signals>` and perform
|
||||
tasks triggered by them.
|
||||
@ -106,10 +101,13 @@ number of items will be specified through the ``MYEXT_ITEMCOUNT`` setting.
|
||||
|
||||
Here is the code of such extension::
|
||||
|
||||
import logging
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
class SpiderOpenCloseLogging(object):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SpiderOpenCloseLogging:
|
||||
|
||||
def __init__(self, item_count):
|
||||
self.item_count = item_count
|
||||
@ -133,20 +131,20 @@ Here is the code of such extension::
|
||||
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
|
||||
crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
|
||||
|
||||
# return the extension object
|
||||
# return the extension object
|
||||
return ext
|
||||
|
||||
def spider_opened(self, spider):
|
||||
spider.log("opened spider %s" % spider.name)
|
||||
logger.info("opened spider %s", spider.name)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
spider.log("closed spider %s" % spider.name)
|
||||
logger.info("closed spider %s", spider.name)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.items_scraped += 1
|
||||
if self.items_scraped == self.item_count:
|
||||
spider.log("scraped %d items, resetting counter" % self.items_scraped)
|
||||
self.item_count = 0
|
||||
if self.items_scraped % self.item_count == 0:
|
||||
logger.info("scraped %d items", self.items_scraped)
|
||||
|
||||
|
||||
.. _topics-extensions-ref:
|
||||
|
||||
@ -159,7 +157,7 @@ General purpose extensions
|
||||
Log Stats extension
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.logstats
|
||||
.. module:: scrapy.extensions.logstats
|
||||
:synopsis: Basic stats logging
|
||||
|
||||
.. class:: LogStats
|
||||
@ -169,7 +167,7 @@ Log basic stats like crawled pages and scraped items.
|
||||
Core Stats extension
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.corestats
|
||||
.. module:: scrapy.extensions.corestats
|
||||
:synopsis: Core stats collection
|
||||
|
||||
.. class:: CoreStats
|
||||
@ -182,13 +180,13 @@ enabled (see :ref:`topics-stats`).
|
||||
Telnet console extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.telnet
|
||||
:synopsis: Telnet console
|
||||
.. module:: scrapy.extensions.telnet
|
||||
:synopsis: Telnet console
|
||||
|
||||
.. class:: scrapy.telnet.TelnetConsole
|
||||
.. class:: TelnetConsole
|
||||
|
||||
Provides a telnet console for getting into a Python interpreter inside the
|
||||
currently running Scrapy process, which can be very useful for debugging.
|
||||
currently running Scrapy process, which can be very useful for debugging.
|
||||
|
||||
The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED`
|
||||
setting, and the server will listen in the port specified in
|
||||
@ -199,16 +197,16 @@ setting, and the server will listen in the port specified in
|
||||
Memory usage extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.memusage
|
||||
.. module:: scrapy.extensions.memusage
|
||||
:synopsis: Memory usage extension
|
||||
|
||||
.. class:: scrapy.contrib.memusage.MemoryUsage
|
||||
.. class:: MemoryUsage
|
||||
|
||||
.. note:: This extension does not work in Windows.
|
||||
|
||||
Monitors the memory used by the Scrapy process that runs the spider and:
|
||||
|
||||
1, sends a notification e-mail when it exceeds a certain value
|
||||
1. sends a notification e-mail when it exceeds a certain value
|
||||
2. closes the spider when it exceeds a certain value
|
||||
|
||||
The notification e-mails can be triggered when a certain warning value is
|
||||
@ -222,15 +220,15 @@ can be configured with the following settings:
|
||||
* :setting:`MEMUSAGE_LIMIT_MB`
|
||||
* :setting:`MEMUSAGE_WARNING_MB`
|
||||
* :setting:`MEMUSAGE_NOTIFY_MAIL`
|
||||
* :setting:`MEMUSAGE_REPORT`
|
||||
* :setting:`MEMUSAGE_CHECK_INTERVAL_SECONDS`
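For example, a hypothetical configuration that warns at 1.5 GB and closes the
spider at 2 GB (values and address are illustrative only)::

    # settings.py
    MEMUSAGE_ENABLED = True
    MEMUSAGE_LIMIT_MB = 2048                      # close the spider above 2 GB
    MEMUSAGE_WARNING_MB = 1536                    # warn by e-mail above 1.5 GB
    MEMUSAGE_NOTIFY_MAIL = ['alerts@example.com']
    MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0        # check memory once per minute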
|
||||
|
||||
Memory debugger extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.memdebug
|
||||
.. module:: scrapy.extensions.memdebug
|
||||
:synopsis: Memory debugger extension
|
||||
|
||||
.. class:: scrapy.contrib.memdebug.MemoryDebugger
|
||||
.. class:: MemoryDebugger
|
||||
|
||||
An extension for debugging memory usage. It collects information about:
|
||||
|
||||
@ -243,10 +241,10 @@ info will be stored in the stats.
|
||||
Close spider extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.closespider
|
||||
.. module:: scrapy.extensions.closespider
|
||||
:synopsis: Close spider extension
|
||||
|
||||
.. class:: scrapy.contrib.closespider.CloseSpider
|
||||
.. class:: CloseSpider
|
||||
|
||||
Closes a spider automatically when some conditions are met, using a specific
|
||||
closing reason for each condition.
|
||||
@ -279,9 +277,11 @@ CLOSESPIDER_ITEMCOUNT
|
||||
Default: ``0``
|
||||
|
||||
An integer which specifies a number of items. If the spider scrapes more than
|
||||
that amount if items and those items are passed by the item pipeline, the
|
||||
spider will be closed with the reason ``closespider_itemcount``. If zero (or
|
||||
non set), spiders won't be closed by number of passed items.
|
||||
that amount and those items are passed by the item pipeline, the
|
||||
spider will be closed with the reason ``closespider_itemcount``.
|
||||
Requests which are currently in the downloader queue (up to
|
||||
:setting:`CONCURRENT_REQUESTS` requests) are still processed.
|
||||
If zero (or not set), spiders won't be closed by the number of passed items.
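For example, a minimal settings sketch (the value is illustrative only)::

    # settings.py
    CLOSESPIDER_ITEMCOUNT = 1000   # close the spider after ~1000 items pass the pipeline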
|
||||
|
||||
.. setting:: CLOSESPIDER_PAGECOUNT
|
||||
|
||||
@ -314,17 +314,17 @@ set), spiders won't be closed by number of errors.
|
||||
StatsMailer extension
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.statsmailer
|
||||
.. module:: scrapy.extensions.statsmailer
|
||||
:synopsis: StatsMailer extension
|
||||
|
||||
.. class:: scrapy.contrib.statsmailer.StatsMailer
|
||||
.. class:: StatsMailer
|
||||
|
||||
This simple extension can be used to send a notification e-mail every time a
|
||||
domain has finished scraping, including the Scrapy stats collected. The email
|
||||
will be sent to all recipients specified in the :setting:`STATSMAILER_RCPTS`
|
||||
setting.
|
||||
|
||||
.. module:: scrapy.contrib.debug
|
||||
.. module:: scrapy.extensions.debug
|
||||
:synopsis: Extensions for debugging Scrapy
|
||||
|
||||
Debugging extensions
|
||||
@ -333,7 +333,7 @@ Debugging extensions
|
||||
Stack trace dump extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. class:: scrapy.contrib.debug.StackTraceDump
|
||||
.. class:: StackTraceDump
|
||||
|
||||
Dumps information about the running process when a `SIGQUIT`_ or `SIGUSR2`_
|
||||
signal is received. The information dumped is the following:
|
||||
@ -345,7 +345,7 @@ signal is received. The information dumped is the following:
|
||||
After the stack trace and engine status is dumped, the Scrapy process continues
|
||||
running normally.
|
||||
|
||||
This extension only works on POSIX-compliant platforms (ie. not Windows),
|
||||
This extension only works on POSIX-compliant platforms (i.e. not Windows),
|
||||
because the `SIGQUIT`_ and `SIGUSR2`_ signals are not available on Windows.
|
||||
|
||||
There are at least two ways to send Scrapy the `SIGQUIT`_ signal:
|
||||
@ -356,21 +356,20 @@ There are at least two ways to send Scrapy the `SIGQUIT`_ signal:
|
||||
|
||||
kill -QUIT <pid>
|
||||
|
||||
.. _SIGUSR2: http://en.wikipedia.org/wiki/SIGUSR1_and_SIGUSR2
|
||||
.. _SIGQUIT: http://en.wikipedia.org/wiki/SIGQUIT
|
||||
.. _SIGUSR2: https://en.wikipedia.org/wiki/SIGUSR1_and_SIGUSR2
|
||||
.. _SIGQUIT: https://en.wikipedia.org/wiki/SIGQUIT
|
||||
|
||||
Debugger extension
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. class:: scrapy.contrib.debug.Debugger
|
||||
.. class:: Debugger
|
||||
|
||||
Invokes a `Python debugger`_ inside a running Scrapy process when a `SIGUSR2`_
|
||||
Invokes a :doc:`Python debugger <library/pdb>` inside a running Scrapy process when a `SIGUSR2`_
|
||||
signal is received. After the debugger is exited, the Scrapy process continues
|
||||
running normally.
|
||||
|
||||
For more info see `Debugging in Python`.
|
||||
For more info see `Debugging in Python`_.
|
||||
|
||||
This extension only works on POSIX-compliant platforms (ie. not Windows).
|
||||
This extension only works on POSIX-compliant platforms (i.e. not Windows).
|
||||
|
||||
.. _Python debugger: http://docs.python.org/library/pdb.html
|
||||
.. _Debugging in Python: http://www.ferg.org/papers/debugging_in_python.html
|
||||
.. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/
|
||||
|
@ -8,11 +8,11 @@ Feed exports
|
||||
|
||||
One of the most frequently required features when implementing scrapers is
|
||||
being able to store the scraped data properly and, quite often, that means
|
||||
generating a "export file" with the scraped data (commonly called "export
|
||||
generating an "export file" with the scraped data (commonly called "export
|
||||
feed") to be consumed by other systems.
|
||||
|
||||
Scrapy provides this functionality out of the box with the Feed Exports, which
|
||||
allows you to generate a feed with the scraped items, using multiple
|
||||
allows you to generate feeds with the scraped items, using multiple
|
||||
serialization formats and storage backends.
|
||||
|
||||
.. _topics-feed-format:
|
||||
@ -21,7 +21,7 @@ Serialization formats
|
||||
=====================
|
||||
|
||||
For serializing the scraped data, the feed exports use the :ref:`Item exporters
|
||||
<topics-exporters>` and these formats are supported out of the box:
|
||||
<topics-exporters>`. These formats are supported out of the box:
|
||||
|
||||
* :ref:`topics-feed-format-json`
|
||||
* :ref:`topics-feed-format-jsonlines`
|
||||
@ -30,55 +30,60 @@ For serializing the scraped data, the feed exports use the :ref:`Item exporters
|
||||
|
||||
But you can also extend the supported format through the
|
||||
:setting:`FEED_EXPORTERS` setting.
|
||||
|
||||
|
||||
.. _topics-feed-format-json:
|
||||
|
||||
JSON
|
||||
----
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``json``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.JsonItemExporter`
|
||||
* See :ref:`this warning <json-with-large-data>` if you're using JSON with large feeds
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``json``
|
||||
* Exporter used: :class:`~scrapy.exporters.JsonItemExporter`
|
||||
* See :ref:`this warning <json-with-large-data>` if you're using JSON with
|
||||
large feeds.
|
||||
|
||||
.. _topics-feed-format-jsonlines:
|
||||
|
||||
JSON lines
|
||||
----------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``jsonlines``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.JsonLinesItemExporter`
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``jsonlines``
|
||||
* Exporter used: :class:`~scrapy.exporters.JsonLinesItemExporter`
|
||||
|
||||
.. _topics-feed-format-csv:
|
||||
|
||||
CSV
|
||||
---
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``csv``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.CsvItemExporter`
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``csv``
|
||||
* Exporter used: :class:`~scrapy.exporters.CsvItemExporter`
|
||||
* To specify columns to export and their order use
|
||||
:setting:`FEED_EXPORT_FIELDS`. Other feed exporters can also use this
|
||||
option, but it is important for CSV because unlike many other export
|
||||
formats CSV uses a fixed header.
|
||||
|
||||
.. _topics-feed-format-xml:
|
||||
|
||||
XML
|
||||
---
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``xml``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.XmlItemExporter`
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``xml``
|
||||
* Exporter used: :class:`~scrapy.exporters.XmlItemExporter`
|
||||
|
||||
.. _topics-feed-format-pickle:
|
||||
|
||||
Pickle
|
||||
------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``pickle``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.PickleItemExporter`
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``pickle``
|
||||
* Exporter used: :class:`~scrapy.exporters.PickleItemExporter`
|
||||
|
||||
.. _topics-feed-format-marshal:
|
||||
|
||||
Marshal
|
||||
-------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``marshal``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.MarshalItemExporter`
|
||||
* Value for the ``format`` key in the :setting:`FEEDS` setting: ``marshal``
|
||||
* Exporter used: :class:`~scrapy.exporters.MarshalItemExporter`
|
||||
|
||||
|
||||
.. _topics-feed-storage:
|
||||
@ -86,19 +91,20 @@ Marshal
|
||||
Storages
|
||||
========
|
||||
|
||||
When using the feed exports you define where to store the feed using a URI_
|
||||
(through the :setting:`FEED_URI` setting). The feed exports supports multiple
|
||||
When using the feed exports you define where to store the feed using one or multiple URIs_
|
||||
(through the :setting:`FEEDS` setting). The feed exports supports multiple
|
||||
storage backend types which are defined by the URI scheme.
|
||||
|
||||
The storage backends supported out of the box are:
|
||||
|
||||
* :ref:`topics-feed-storage-fs`
|
||||
* :ref:`topics-feed-storage-ftp`
|
||||
* :ref:`topics-feed-storage-s3` (requires boto_)
|
||||
* :ref:`topics-feed-storage-s3` (requires botocore_)
|
||||
* :ref:`topics-feed-storage-gcs` (requires `google-cloud-storage`_)
|
||||
* :ref:`topics-feed-storage-stdout`
|
||||
|
||||
Some storage backends may be unavailable if the required external libraries are
|
||||
not available. For example, the S3 backend is only available if the boto_
|
||||
not available. For example, the S3 backend is only available if the botocore_
|
||||
library is installed.
|
||||
|
||||
|
||||
@ -159,6 +165,14 @@ The feeds are stored in a FTP server.
|
||||
* Example URI: ``ftp://user:pass@ftp.example.com/path/to/export.csv``
|
||||
* Required external libraries: none
|
||||
|
||||
FTP supports two different connection modes: `active or passive
|
||||
<https://stackoverflow.com/a/1699163>`_. Scrapy uses the passive connection
|
||||
mode by default. To use the active connection mode instead, set the
|
||||
:setting:`FEED_STORAGE_FTP_ACTIVE` setting to ``True``.
|
||||
|
||||
This storage backend uses :ref:`delayed file delivery <delayed-file-delivery>`.
|
||||
|
||||
|
||||
.. _topics-feed-storage-s3:
|
||||
|
||||
S3
|
||||
@ -172,7 +186,7 @@ The feeds are stored on `Amazon S3`_.
|
||||
* ``s3://mybucket/path/to/export.csv``
|
||||
* ``s3://aws_key:aws_secret@mybucket/path/to/export.csv``
|
||||
|
||||
* Required external libraries: `boto`_
|
||||
* Required external libraries: `botocore`_
|
||||
|
||||
The AWS credentials can be passed as user/password in the URI, or they can be
|
||||
passed through the following settings:
|
||||
@ -180,6 +194,41 @@ passed through the following settings:
|
||||
* :setting:`AWS_ACCESS_KEY_ID`
|
||||
* :setting:`AWS_SECRET_ACCESS_KEY`
|
||||
|
||||
You can also define a custom ACL for exported feeds using this setting:
|
||||
|
||||
* :setting:`FEED_STORAGE_S3_ACL`
|
||||
|
||||
This storage backend uses :ref:`delayed file delivery <delayed-file-delivery>`.
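For example, a minimal settings sketch (placeholder credentials and
illustrative values only)::

    # settings.py -- never commit real credentials
    AWS_ACCESS_KEY_ID = 'my-access-key'        # placeholder
    AWS_SECRET_ACCESS_KEY = 'my-secret-key'    # placeholder
    FEED_STORAGE_S3_ACL = 'public-read'        # an S3 canned ACL
    FEEDS = {
        's3://mybucket/path/to/export.csv': {'format': 'csv'},
    }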
|
||||
|
||||
|
||||
.. _topics-feed-storage-gcs:
|
||||
|
||||
Google Cloud Storage (GCS)
|
||||
--------------------------
|
||||
|
||||
.. versionadded:: 2.3
|
||||
|
||||
The feeds are stored on `Google Cloud Storage`_.
|
||||
|
||||
* URI scheme: ``gs``
|
||||
* Example URIs:
|
||||
|
||||
* ``gs://mybucket/path/to/export.csv``
|
||||
|
||||
* Required external libraries: `google-cloud-storage`_.
|
||||
|
||||
For more information about authentication, please refer to `Google Cloud documentation <https://cloud.google.com/docs/authentication/production>`_.
|
||||
|
||||
You can set a *Project ID* and *Access Control List (ACL)* through the following settings:
|
||||
|
||||
* :setting:`FEED_STORAGE_GCS_ACL`
|
||||
* :setting:`GCS_PROJECT_ID`
|
||||
|
||||
This storage backend uses :ref:`delayed file delivery <delayed-file-delivery>`.
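For example, a minimal settings sketch (illustrative values only)::

    # settings.py
    GCS_PROJECT_ID = 'my-project-id'          # placeholder project ID
    FEED_STORAGE_GCS_ACL = 'publicRead'       # a predefined GCS ACL
    FEEDS = {
        'gs://mybucket/path/to/export.csv': {'format': 'csv'},
    }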
|
||||
|
||||
.. _google-cloud-storage: https://cloud.google.com/storage/docs/reference/libraries#client-libraries-install-python
|
||||
|
||||
|
||||
.. _topics-feed-storage-stdout:
|
||||
|
||||
Standard output
|
||||
@ -192,38 +241,171 @@ The feeds are written to the standard output of the Scrapy process.
|
||||
* Required external libraries: none
|
||||
|
||||
|
||||
.. _delayed-file-delivery:
|
||||
|
||||
Delayed file delivery
|
||||
---------------------
|
||||
|
||||
As indicated above, some of the described storage backends use delayed file
|
||||
delivery.
|
||||
|
||||
These storage backends do not upload items to the feed URI as those items are
|
||||
scraped. Instead, Scrapy writes items into a temporary local file, and only
|
||||
once all the file contents have been written (i.e. at the end of the crawl) is
|
||||
that file uploaded to the feed URI.
|
||||
|
||||
If you want item delivery to start earlier when using one of these storage
|
||||
backends, use :setting:`FEED_EXPORT_BATCH_ITEM_COUNT` to split the output items
|
||||
in multiple files, with the specified maximum item count per file. That way, as
|
||||
soon as a file reaches the maximum item count, that file is delivered to the
|
||||
feed URI, allowing item delivery to start way before the end of the crawl.
|
||||
|
||||
|
||||
Settings
|
||||
========
|
||||
|
||||
These are the settings used for configuring the feed exports:
|
||||
|
||||
* :setting:`FEED_URI` (mandatory)
|
||||
* :setting:`FEED_FORMAT`
|
||||
* :setting:`FEED_STORAGES`
|
||||
* :setting:`FEED_EXPORTERS`
|
||||
* :setting:`FEEDS` (mandatory)
|
||||
* :setting:`FEED_EXPORT_ENCODING`
|
||||
* :setting:`FEED_STORE_EMPTY`
|
||||
* :setting:`FEED_EXPORT_FIELDS`
|
||||
* :setting:`FEED_EXPORT_INDENT`
|
||||
* :setting:`FEED_STORAGES`
|
||||
* :setting:`FEED_STORAGE_FTP_ACTIVE`
|
||||
* :setting:`FEED_STORAGE_S3_ACL`
|
||||
* :setting:`FEED_EXPORTERS`
|
||||
* :setting:`FEED_EXPORT_BATCH_ITEM_COUNT`
|
||||
|
||||
.. currentmodule:: scrapy.contrib.feedexport
|
||||
.. currentmodule:: scrapy.extensions.feedexport
|
||||
|
||||
.. setting:: FEED_URI
|
||||
.. setting:: FEEDS
|
||||
|
||||
FEED_URI
|
||||
--------
|
||||
FEEDS
|
||||
-----
|
||||
|
||||
.. versionadded:: 2.1
|
||||
|
||||
Default: ``{}``
|
||||
|
||||
A dictionary in which every key is a feed URI (or a :class:`pathlib.Path`
|
||||
object) and each value is a nested dictionary containing configuration
|
||||
parameters for the specific feed.
|
||||
|
||||
This setting is required for enabling the feed export feature.
|
||||
|
||||
See :ref:`topics-feed-storage-backends` for supported URI schemes.
|
||||
|
||||
For instance::
|
||||
|
||||
{
|
||||
'items.json': {
|
||||
'format': 'json',
|
||||
'encoding': 'utf8',
|
||||
'store_empty': False,
|
||||
'fields': None,
|
||||
'indent': 4,
|
||||
},
|
||||
'/home/user/documents/items.xml': {
|
||||
'format': 'xml',
|
||||
'fields': ['name', 'price'],
|
||||
'encoding': 'latin1',
|
||||
'indent': 8,
|
||||
},
|
||||
pathlib.Path('items.csv'): {
|
||||
'format': 'csv',
|
||||
'fields': ['price', 'name'],
|
||||
},
|
||||
}
|
||||
|
||||
The following is a list of the accepted keys and the setting that is used
|
||||
as a fallback value if that key is not provided for a specific feed definition:
|
||||
|
||||
- ``format``: the :ref:`serialization format <topics-feed-format>`.
|
||||
|
||||
This setting is mandatory, there is no fallback value.
|
||||
|
||||
- ``batch_item_count``: falls back to
|
||||
:setting:`FEED_EXPORT_BATCH_ITEM_COUNT`.
|
||||
|
||||
- ``encoding``: falls back to :setting:`FEED_EXPORT_ENCODING`.
|
||||
|
||||
- ``fields``: falls back to :setting:`FEED_EXPORT_FIELDS`.
|
||||
|
||||
- ``indent``: falls back to :setting:`FEED_EXPORT_INDENT`.
|
||||
|
||||
- ``overwrite``: whether to overwrite the file if it already exists
|
||||
(``True``) or append to its content (``False``).
|
||||
|
||||
The default value depends on the :ref:`storage backend
|
||||
<topics-feed-storage-backends>`:
|
||||
|
||||
- :ref:`topics-feed-storage-fs`: ``False``
|
||||
|
||||
- :ref:`topics-feed-storage-ftp`: ``True``
|
||||
|
||||
.. note:: Some FTP servers may not support appending to files (the
|
||||
``APPE`` FTP command).
|
||||
|
||||
- :ref:`topics-feed-storage-s3`: ``True`` (appending `is not supported
|
||||
<https://forums.aws.amazon.com/message.jspa?messageID=540395>`_)
|
||||
|
||||
- :ref:`topics-feed-storage-stdout`: ``False`` (overwriting is not supported)
|
||||
|
||||
- ``store_empty``: falls back to :setting:`FEED_STORE_EMPTY`.
|
||||
|
||||
- ``uri_params``: falls back to :setting:`FEED_URI_PARAMS`.
|
||||
|
||||
|
||||
.. setting:: FEED_EXPORT_ENCODING
|
||||
|
||||
FEED_EXPORT_ENCODING
|
||||
--------------------
|
||||
|
||||
Default: ``None``
|
||||
|
||||
The URI of the export feed. See :ref:`topics-feed-storage-backends` for
|
||||
supported URI schemes.
|
||||
The encoding to be used for the feed.
|
||||
|
||||
This setting is required for enabling the feed exports.
|
||||
If unset or set to ``None`` (default) it uses UTF-8 for everything except JSON output,
|
||||
which uses safe numeric encoding (``\uXXXX`` sequences) for historic reasons.
|
||||
|
||||
.. setting:: FEED_FORMAT
|
||||
Use ``utf-8`` if you want UTF-8 for JSON too.
|
||||
|
||||
FEED_FORMAT
|
||||
-----------
|
||||
.. setting:: FEED_EXPORT_FIELDS
|
||||
|
||||
The serialization format to be used for the feed. See
|
||||
:ref:`topics-feed-format` for possible values.
|
||||
FEED_EXPORT_FIELDS
|
||||
------------------
|
||||
|
||||
Default: ``None``
|
||||
|
||||
A list of fields to export, optional.
|
||||
Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``.
|
||||
|
||||
Use the FEED_EXPORT_FIELDS option to define fields to export and their order.
|
||||
|
||||
When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields
|
||||
defined in :ref:`item objects <topics-items>` yielded by your spider.
|
||||
|
||||
If an exporter requires a fixed set of fields (this is the case for
|
||||
:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS
|
||||
is empty or None, then Scrapy tries to infer field names from the
|
||||
exported data - currently it uses field names from the first item.
|
||||
|
||||
.. setting:: FEED_EXPORT_INDENT
|
||||
|
||||
FEED_EXPORT_INDENT
|
||||
------------------
|
||||
|
||||
Default: ``0``
|
||||
|
||||
Amount of spaces used to indent the output on each level. If ``FEED_EXPORT_INDENT``
|
||||
is a non-negative integer, then array elements and object members will be pretty-printed
|
||||
with that indent level. An indent level of ``0`` (the default), or negative,
|
||||
will put each item on a new line. ``None`` selects the most compact representation.
|
||||
|
||||
Currently implemented only by :class:`~scrapy.exporters.JsonItemExporter`
|
||||
and :class:`~scrapy.exporters.XmlItemExporter`, i.e. when you are exporting
|
||||
to ``.json`` or ``.xml``.
|
||||
|
||||
.. setting:: FEED_STORE_EMPTY
|
||||
|
||||
@ -232,64 +414,207 @@ FEED_STORE_EMPTY
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Whether to export empty feeds (ie. feeds with no items).
|
||||
Whether to export empty feeds (i.e. feeds with no items).
|
||||
|
||||
.. setting:: FEED_STORAGES
|
||||
|
||||
FEED_STORAGES
|
||||
-------------
|
||||
|
||||
Default:: ``{}``
|
||||
Default: ``{}``
|
||||
|
||||
A dict containing additional feed storage backends supported by your project.
|
||||
The keys are URI schemes and the values are paths to storage classes.
|
||||
|
||||
.. setting:: FEED_STORAGE_FTP_ACTIVE
|
||||
|
||||
FEED_STORAGE_FTP_ACTIVE
|
||||
-----------------------
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Whether to use the active connection mode when exporting feeds to an FTP server
|
||||
(``True``) or use the passive connection mode instead (``False``, default).
|
||||
|
||||
For information about FTP connection modes, see `What is the difference between
|
||||
active and passive FTP? <https://stackoverflow.com/a/1699163>`_.
|
||||
|
||||
.. setting:: FEED_STORAGE_S3_ACL
|
||||
|
||||
FEED_STORAGE_S3_ACL
|
||||
-------------------
|
||||
|
||||
Default: ``''`` (empty string)
|
||||
|
||||
A string containing a custom ACL for feeds exported to Amazon S3 by your project.
|
||||
|
||||
For a complete list of available values, access the `Canned ACL`_ section on Amazon S3 docs.
|
||||
|
||||
.. setting:: FEED_STORAGES_BASE
|
||||
|
||||
FEED_STORAGES_BASE
|
||||
------------------
|
||||
|
||||
Default::
|
||||
Default::
|
||||
|
||||
{
|
||||
'': 'scrapy.contrib.feedexport.FileFeedStorage',
|
||||
'file': 'scrapy.contrib.feedexport.FileFeedStorage',
|
||||
'stdout': 'scrapy.contrib.feedexport.StdoutFeedStorage',
|
||||
's3': 'scrapy.contrib.feedexport.S3FeedStorage',
|
||||
'ftp': 'scrapy.contrib.feedexport.FTPFeedStorage',
|
||||
'': 'scrapy.extensions.feedexport.FileFeedStorage',
|
||||
'file': 'scrapy.extensions.feedexport.FileFeedStorage',
|
||||
'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
|
||||
's3': 'scrapy.extensions.feedexport.S3FeedStorage',
|
||||
'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
|
||||
}
|
||||
|
||||
A dict containing the built-in feed storage backends supported by Scrapy.
|
||||
A dict containing the built-in feed storage backends supported by Scrapy. You
|
||||
can disable any of these backends by assigning ``None`` to their URI scheme in
|
||||
:setting:`FEED_STORAGES`. E.g., to disable the built-in FTP storage backend
|
||||
(without replacement), place this in your ``settings.py``::
|
||||
|
||||
FEED_STORAGES = {
|
||||
'ftp': None,
|
||||
}
|
||||
|
||||
.. setting:: FEED_EXPORTERS
|
||||
|
||||
FEED_EXPORTERS
|
||||
--------------
|
||||
|
||||
Default:: ``{}``
|
||||
Default: ``{}``
|
||||
|
||||
A dict containing additional exporters supported by your project. The keys are
|
||||
URI schemes and the values are paths to :ref:`Item exporter <topics-exporters>`
|
||||
classes.
|
||||
serialization formats and the values are paths to :ref:`Item exporter
|
||||
<topics-exporters>` classes.
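For example, a sketch of registering a hypothetical custom exporter
(``YamlItemExporter`` is an illustrative name, not a Scrapy class)::

    # settings.py
    FEED_EXPORTERS = {
        'yaml': 'myproject.exporters.YamlItemExporter',  # hypothetical exporter class
    }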
|
||||
|
||||
.. setting:: FEED_EXPORTERS_BASE
|
||||
|
||||
FEED_EXPORTERS_BASE
|
||||
-------------------
|
||||
Default::
|
||||
|
||||
Default::
|
||||
|
||||
FEED_EXPORTERS_BASE = {
|
||||
'json': 'scrapy.contrib.exporter.JsonItemExporter',
|
||||
'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
|
||||
'csv': 'scrapy.contrib.exporter.CsvItemExporter',
|
||||
'xml': 'scrapy.contrib.exporter.XmlItemExporter',
|
||||
'marshal': 'scrapy.contrib.exporter.MarshalItemExporter',
|
||||
{
|
||||
'json': 'scrapy.exporters.JsonItemExporter',
|
||||
'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
|
||||
'jl': 'scrapy.exporters.JsonLinesItemExporter',
|
||||
'csv': 'scrapy.exporters.CsvItemExporter',
|
||||
'xml': 'scrapy.exporters.XmlItemExporter',
|
||||
'marshal': 'scrapy.exporters.MarshalItemExporter',
|
||||
'pickle': 'scrapy.exporters.PickleItemExporter',
|
||||
}
|
||||
|
||||
A dict containing the built-in feed exporters supported by Scrapy.
|
||||
A dict containing the built-in feed exporters supported by Scrapy. You can
|
||||
disable any of these exporters by assigning ``None`` to their serialization
|
||||
format in :setting:`FEED_EXPORTERS`. E.g., to disable the built-in CSV exporter
|
||||
(without replacement), place this in your ``settings.py``::
|
||||
|
||||
FEED_EXPORTERS = {
|
||||
'csv': None,
|
||||
}
|
||||
|
||||
|
||||
.. _URI: http://en.wikipedia.org/wiki/Uniform_Resource_Identifier
|
||||
.. _Amazon S3: http://aws.amazon.com/s3/
|
||||
.. _boto: http://code.google.com/p/boto/
|
||||
.. setting:: FEED_EXPORT_BATCH_ITEM_COUNT
|
||||
|
||||
FEED_EXPORT_BATCH_ITEM_COUNT
|
||||
-----------------------------
|
||||
|
||||
Default: ``0``
|
||||
|
||||
If assigned an integer number higher than ``0``, Scrapy generates multiple output files
|
||||
storing up to the specified number of items in each output file.
|
||||
|
||||
When generating multiple output files, you must use at least one of the following
|
||||
placeholders in the feed URI to indicate how the different output file names are
|
||||
generated:
|
||||
|
||||
* ``%(batch_time)s`` - gets replaced by a timestamp when the feed is being created
|
||||
(e.g. ``2020-03-28T14-45-08.237134``)
|
||||
|
||||
* ``%(batch_id)d`` - gets replaced by the 1-based sequence number of the batch.
|
||||
|
||||
Use :ref:`printf-style string formatting <python:old-string-formatting>` to
|
||||
alter the number format. For example, to make the batch ID a 5-digit
|
||||
number by introducing leading zeroes as needed, use ``%(batch_id)05d``
|
||||
(e.g. ``3`` becomes ``00003``, ``123`` becomes ``00123``).
|
||||
|
||||
For instance, if your settings include::
|
||||
|
||||
FEED_EXPORT_BATCH_ITEM_COUNT = 100
|
||||
|
||||
And your :command:`crawl` command line is::
|
||||
|
||||
scrapy crawl spidername -o "dirname/%(batch_id)d-filename%(batch_time)s.json"
|
||||
|
||||
The command line above can generate a directory tree like::
|
||||
|
||||
->projectname
|
||||
-->dirname
|
||||
--->1-filename2020-03-28T14-45-08.237134.json
|
||||
--->2-filename2020-03-28T14-45-09.148903.json
|
||||
--->3-filename2020-03-28T14-45-10.046092.json
|
||||
|
||||
Where the first and second files contain exactly 100 items. The last one contains
|
||||
100 items or fewer.
|
||||
|
||||
|
||||
.. setting:: FEED_URI_PARAMS
|
||||
|
||||
FEED_URI_PARAMS
|
||||
---------------
|
||||
|
||||
Default: ``None``
|
||||
|
||||
A string with the import path of a function to set the parameters to apply with
|
||||
:ref:`printf-style string formatting <python:old-string-formatting>` to the
|
||||
feed URI.
|
||||
|
||||
The function signature should be as follows:
|
||||
|
||||
.. function:: uri_params(params, spider)
|
||||
|
||||
Return a :class:`dict` of key-value pairs to apply to the feed URI using
|
||||
:ref:`printf-style string formatting <python:old-string-formatting>`.
|
||||
|
||||
:param params: default key-value pairs
|
||||
|
||||
Specifically:
|
||||
|
||||
- ``batch_id``: ID of the file batch. See
|
||||
:setting:`FEED_EXPORT_BATCH_ITEM_COUNT`.
|
||||
|
||||
If :setting:`FEED_EXPORT_BATCH_ITEM_COUNT` is ``0``, ``batch_id``
|
||||
is always ``1``.
|
||||
|
||||
- ``batch_time``: UTC date and time, in ISO format with ``:``
|
||||
replaced with ``-``.
|
||||
|
||||
See :setting:`FEED_EXPORT_BATCH_ITEM_COUNT`.
|
||||
|
||||
- ``time``: ``batch_time``, with microseconds set to ``0``.
|
||||
:type params: dict
|
||||
|
||||
:param spider: source spider of the feed items
|
||||
:type spider: scrapy.spiders.Spider
|
||||
|
||||
For example, to include the :attr:`name <scrapy.spiders.Spider.name>` of the
|
||||
source spider in the feed URI:
|
||||
|
||||
#. Define the following function somewhere in your project::
|
||||
|
||||
# myproject/utils.py
|
||||
def uri_params(params, spider):
|
||||
return {**params, 'spider_name': spider.name}
|
||||
|
||||
#. Point :setting:`FEED_URI_PARAMS` to that function in your settings::
|
||||
|
||||
# myproject/settings.py
|
||||
FEED_URI_PARAMS = 'myproject.utils.uri_params'
|
||||
|
||||
#. Use ``%(spider_name)s`` in your feed URI::
|
||||
|
||||
scrapy crawl <spider_name> -o "%(spider_name)s.jl"
|
||||
|
||||
|
||||
.. _URIs: https://en.wikipedia.org/wiki/Uniform_Resource_Identifier
|
||||
.. _Amazon S3: https://aws.amazon.com/s3/
|
||||
.. _botocore: https://github.com/boto/botocore
|
||||
.. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
|
||||
.. _Google Cloud Storage: https://cloud.google.com/storage/
|
||||
|
@ -1,167 +0,0 @@
|
||||
.. _topics-firebug:
|
||||
|
||||
==========================
|
||||
Using Firebug for scraping
|
||||
==========================
|
||||
|
||||
.. note:: Google Directory, the example website used in this guide is no longer
|
||||
available as it `has been shut down by Google`_. The concepts in this guide
|
||||
are still valid though. If you want to update this guide to use a new
|
||||
(working) site, your contribution will be more than welcome! See :ref:`topics-contributing`
|
||||
for information on how to do so.
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
This document explains how to use `Firebug`_ (a Firefox add-on) to make the
|
||||
scraping process easier and more fun. For other useful Firefox add-ons see
|
||||
:ref:`topics-firefox-addons`. There are some caveats with using Firefox add-ons
|
||||
to inspect pages, see :ref:`topics-firefox-livedom`.
|
||||
|
||||
In this example, we'll show how to use `Firebug`_ to scrape data from the
|
||||
`Google Directory`_, which contains the same data as the `Open Directory
|
||||
Project`_ used in the :ref:`tutorial <intro-tutorial>` but with a different
|
||||
face.
|
||||
|
||||
.. _Firebug: http://getfirebug.com
|
||||
.. _Google Directory: http://directory.google.com/
|
||||
.. _Open Directory Project: http://www.dmoz.org
|
||||
|
||||
Firebug comes with a very useful feature called `Inspect Element`_ which allows
|
||||
you to inspect the HTML code of the different page elements just by hovering
|
||||
your mouse over them. Otherwise you would have to search for the tags manually
|
||||
through the HTML body which can be a very tedious task.
|
||||
|
||||
.. _Inspect Element: http://www.youtube.com/watch?v=-pT_pDe54aA
|
||||
|
||||
In the following screenshot you can see the `Inspect Element`_ tool in action.
|
||||
|
||||
.. image:: _images/firebug1.png
|
||||
:width: 913
|
||||
:height: 600
|
||||
:alt: Inspecting elements with Firebug
|
||||
|
||||
At first sight, we can see that the directory is divided in categories, which
|
||||
are also divided in subcategories.
|
||||
|
||||
However, it seems that there are more subcategories than the ones being shown
|
||||
in this page, so we'll keep looking:
|
||||
|
||||
.. image:: _images/firebug2.png
|
||||
:width: 819
|
||||
:height: 629
|
||||
:alt: Inspecting elements with Firebug
|
||||
|
||||
As expected, the subcategories contain links to other subcategories, and also
|
||||
links to actual websites, which is the purpose of the directory.
|
||||
|
||||
Getting links to follow
|
||||
=======================
|
||||
|
||||
By looking at the category URLs we can see they share a pattern:
|
||||
|
||||
http://directory.google.com/Category/Subcategory/Another_Subcategory
|
||||
|
||||
Once we know that, we are able to construct a regular expression to follow
|
||||
those links. For example, the following one::
|
||||
|
||||
directory\.google\.com/[A-Z][a-zA-Z_/]+$
|
||||
|
||||
So, based on that regular expression we can create the first crawling rule::
|
||||
|
||||
Rule(LinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
|
||||
'parse_category',
|
||||
follow=True,
|
||||
),
|
||||
|
||||
The :class:`~scrapy.contrib.spiders.Rule` object instructs
|
||||
:class:`~scrapy.contrib.spiders.CrawlSpider` based spiders how to follow the
|
||||
category links. ``parse_category`` will be a method of the spider which will
|
||||
process and extract data from those pages.
|
||||
|
||||
This is how the spider would look so far::
|
||||
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
|
||||
class GoogleDirectorySpider(CrawlSpider):
|
||||
name = 'directory.google.com'
|
||||
allowed_domains = ['directory.google.com']
|
||||
start_urls = ['http://directory.google.com/']
|
||||
|
||||
rules = (
|
||||
Rule(LinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
|
||||
'parse_category', follow=True,
|
||||
),
|
||||
)
|
||||
|
||||
def parse_category(self, response):
|
||||
# write the category page data extraction code here
|
||||
pass
|
||||
|
||||
|
||||
Extracting the data
|
||||
===================
|
||||
|
||||
Now we're going to write the code to extract data from those pages.
|
||||
|
||||
With the help of Firebug, we'll take a look at some page containing links to
|
||||
websites (say http://directory.google.com/Top/Arts/Awards/) and find out how we can
|
||||
extract those links using :ref:`Selectors <topics-selectors>`. We'll also
|
||||
use the :ref:`Scrapy shell <topics-shell>` to test those XPath's and make sure
|
||||
they work as we expect.
|
||||
|
||||
.. image:: _images/firebug3.png
|
||||
:width: 965
|
||||
:height: 751
|
||||
:alt: Inspecting elements with Firebug
|
||||
|
||||
As you can see, the page markup is not very descriptive: the elements don't
|
||||
contain ``id``, ``class`` or any attribute that clearly identifies them, so
|
||||
we'll use the ranking bars as a reference point to select the data to extract
|
||||
when we construct our XPaths.
|
||||
|
||||
After using FireBug, we can see that each link is inside a ``td`` tag, which is
|
||||
itself inside a ``tr`` tag that also contains the link's ranking bar (in
|
||||
another ``td``).
|
||||
|
||||
So we can select the ranking bar, then find its parent (the ``tr``), and then
|
||||
finally, the link's ``td`` (which contains the data we want to scrape).
|
||||
|
||||
This results in the following XPath::
|
||||
|
||||
//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td//a
|
||||
|
||||
It's important to use the :ref:`Scrapy shell <topics-shell>` to test these
|
||||
complex XPath expressions and make sure they work as expected.
|
||||
|
||||
Basically, that expression will look for the ranking bar's ``td`` element, and
|
||||
then select any ``td`` element that has a descendant ``a`` element whose
|
||||
``href`` attribute contains the string ``#pagerank``.
|
||||
|
||||
Of course, this is not the only XPath, and maybe not the simplest one to select
|
||||
that data. Another approach could be, for example, to find any ``font`` tags
|
||||
that have that grey colour of the links.
|
||||
|
||||
Finally, we can write our ``parse_category()`` method::
|
||||
|
||||
def parse_category(self, response):
|
||||
# The path to website links in directory page
|
||||
links = response.xpath('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')
|
||||
|
||||
for link in links:
|
||||
item = DirectoryItem()
|
||||
item['name'] = link.xpath('a/text()').extract()
|
||||
item['url'] = link.xpath('a/@href').extract()
|
||||
item['description'] = link.xpath('font[2]/text()').extract()
|
||||
yield item
|
||||
|
||||
|
||||
Be aware that you may find some elements which appear in Firebug but
|
||||
not in the original HTML, such as the typical case of ``<tbody>``
|
||||
elements.
|
||||
|
||||
The same goes for other tags that differ from the page HTML source, because
Firebug inspects the live DOM.
|
||||
|
||||
.. _has been shut down by Google: http://searchenginewatch.com/article/2096661/Google-Directory-Has-Been-Shut-Down
|
@ -1,82 +0,0 @@
|
||||
.. _topics-firefox:
|
||||
|
||||
==========================
|
||||
Using Firefox for scraping
|
||||
==========================
|
||||
|
||||
Here is a list of tips and advice on using Firefox for scraping, along with a
|
||||
list of useful Firefox add-ons to ease the scraping process.
|
||||
|
||||
.. _topics-firefox-livedom:
|
||||
|
||||
Caveats with inspecting the live browser DOM
|
||||
============================================
|
||||
|
||||
Since Firefox add-ons operate on a live browser DOM, what you'll actually see
|
||||
when inspecting the page source is not the original HTML, but a modified one
|
||||
after applying some browser clean up and executing Javascript code. Firefox,
|
||||
in particular, is known for adding ``<tbody>`` elements to tables. Scrapy, on
|
||||
the other hand, does not modify the original page HTML, so you won't be able to
|
||||
extract any data if you use ``<tbody>`` in your XPath expressions.
|
||||
|
||||
Therefore, you should keep in mind the following things when working with
|
||||
Firefox and XPath:
|
||||
|
||||
* Disable Firefox Javascript while inspecting the DOM looking for XPaths to be
|
||||
used in Scrapy
|
||||
|
||||
* Never use full XPath paths, use relative and clever ones based on attributes
|
||||
(such as ``id``, ``class``, ``width``, etc) or any identifying features like
|
||||
``contains(@href, 'image')``.
|
||||
|
||||
* Never include ``<tbody>`` elements in your XPath expressions unless you
|
||||
really know what you're doing
|
||||
|
||||
.. _topics-firefox-addons:
|
||||
|
||||
Useful Firefox add-ons for scraping
|
||||
===================================
|
||||
|
||||
Firebug
|
||||
-------
|
||||
|
||||
`Firebug`_ is a widely known tool among web developers and it's also very
|
||||
useful for scraping. In particular, its `Inspect Element`_ feature comes very
|
||||
handy when you need to construct the XPaths for extracting data because it
|
||||
allows you to view the HTML code of each page element while moving your mouse
|
||||
over it.
|
||||
|
||||
See :ref:`topics-firebug` for a detailed guide on how to use Firebug with
|
||||
Scrapy.
|
||||
|
||||
XPather
|
||||
-------
|
||||
|
||||
`XPather`_ allows you to test XPath expressions directly on the pages.
|
||||
|
||||
XPath Checker
|
||||
-------------
|
||||
|
||||
`XPath Checker`_ is another Firefox add-on for testing XPaths on your pages.
|
||||
|
||||
Tamper Data
|
||||
-----------
|
||||
|
||||
`Tamper Data`_ is a Firefox add-on which allows you to view and modify the HTTP
|
||||
request headers sent by Firefox. Firebug also allows to view HTTP headers, but
|
||||
not to modify them.
|
||||
|
||||
Firecookie
|
||||
----------
|
||||
|
||||
`Firecookie`_ makes it easier to view and manage cookies. You can use this
|
||||
extension to create a new cookie, delete existing cookies, see a list of cookies
|
||||
for the current site, manage cookies permissions and a lot more.
|
||||
|
||||
.. _Firebug: http://getfirebug.com
|
||||
.. _Inspect Element: http://www.youtube.com/watch?v=-pT_pDe54aA
|
||||
.. _XPather: https://addons.mozilla.org/firefox/addon/1192
|
||||
.. _XPath Checker: https://addons.mozilla.org/firefox/addon/1095
|
||||
.. _Tamper Data: http://addons.mozilla.org/firefox/addon/966
|
||||
.. _Firecookie: https://addons.mozilla.org/firefox/addon/6683
|
||||
|
@ -1,323 +0,0 @@
|
||||
.. _topics-images:
|
||||
|
||||
=======================
|
||||
Downloading Item Images
|
||||
=======================
|
||||
|
||||
.. currentmodule:: scrapy.contrib.pipeline.images
|
||||
|
||||
Scrapy provides an :doc:`item pipeline </topics/item-pipeline>` for downloading
|
||||
images attached to a particular item, for example, when you scrape products and
|
||||
also want to download their images locally.
|
||||
|
||||
This pipeline, called the Images Pipeline and implemented in the
|
||||
:class:`ImagesPipeline` class, provides a convenient way for
|
||||
downloading and storing images locally with some additional features:
|
||||
|
||||
* Convert all downloaded images to a common format (JPG) and mode (RGB)
|
||||
* Avoid re-downloading images which were downloaded recently
|
||||
* Thumbnail generation
|
||||
* Check images width/height to make sure they meet a minimum constraint
|
||||
|
||||
This pipeline also keeps an internal queue of those images which are currently
|
||||
being scheduled for download, and connects those items that arrive containing
|
||||
the same image, to that queue. This avoids downloading the same image more than
|
||||
once when it's shared by several items.
|
||||
|
||||
`Pillow`_ is used for thumbnailing and normalizing images to JPEG/RGB format,
|
||||
so you need to install this library in order to use the images pipeline.
|
||||
`Python Imaging Library`_ (PIL) should also work in most cases, but it
|
||||
is known to cause trouble in some setups, so we recommend using `Pillow`_
|
||||
instead of `PIL <Python Imaging Library>`_.
|
||||
|
||||
.. _Pillow: https://github.com/python-imaging/Pillow
|
||||
.. _Python Imaging Library: http://www.pythonware.com/products/pil/
|
||||
|
||||
Using the Images Pipeline
|
||||
=========================
|
||||
|
||||
The typical workflow, when using the :class:`ImagesPipeline` goes like
|
||||
this:
|
||||
|
||||
1. In a Spider, you scrape an item and put the URLs of its images into a
|
||||
``image_urls`` field.
|
||||
|
||||
2. The item is returned from the spider and goes to the item pipeline.
|
||||
|
||||
3. When the item reaches the :class:`ImagesPipeline`, the URLs in the
|
||||
``image_urls`` field are scheduled for download using the standard
|
||||
Scrapy scheduler and downloader (which means the scheduler and downloader
|
||||
middlewares are reused), but with a higher priority, processing them before other
|
||||
pages are scraped. The item remains "locked" at that particular pipeline stage
|
||||
until the images have finished downloading (or fail for some reason).
|
||||
|
||||
4. When the images are downloaded another field (``images``) will be populated
|
||||
with the results. This field will contain a list of dicts with information
|
||||
about the images downloaded, such as the downloaded path, the original
|
||||
scraped URL (taken from the ``image_urls`` field), and the image checksum.
|
||||
The images in the list of the ``images`` field will retain the same order of
|
||||
the original ``image_urls`` field. If some image failed downloading, an
|
||||
error will be logged and the image won't be present in the ``images`` field.
|
||||
|
||||
|
||||
Usage example
|
||||
=============
|
||||
|
||||
In order to use the image pipeline you just need to :ref:`enable it
|
||||
<topics-images-enabling>` and define an item with the ``image_urls`` and
|
||||
``images`` fields::
|
||||
|
||||
import scrapy
|
||||
|
||||
class MyItem(scrapy.Item):
|
||||
|
||||
# ... other item fields ...
|
||||
image_urls = scrapy.Field()
|
||||
images = scrapy.Field()
|
||||
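For reference, here is a minimal sketch of a spider that fills the
``image_urls`` field; the page layout, URLs and import path are hypothetical,
and ``MyItem`` is the item class defined above::

    import scrapy

    from myproject.items import MyItem  # hypothetical import path

    class ProductImagesSpider(scrapy.Spider):
        name = 'product_images'
        start_urls = ['http://www.example.com/products']

        def parse(self, response):
            item = MyItem()
            # the pipeline downloads every (absolute) URL listed in image_urls
            item['image_urls'] = [response.urljoin(url)
                                  for url in response.xpath('//img/@src').extract()]
            yield item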
|
||||
If you need something more complex and want to customize the default images
|
||||
pipeline behaviour, see :ref:`topics-images-override`.
|
||||
|
||||
.. _topics-images-enabling:
|
||||
|
||||
Enabling your Images Pipeline
|
||||
=============================
|
||||
|
||||
.. setting:: IMAGES_STORE
|
||||
|
||||
To enable your images pipeline you must first add it to your project
|
||||
:setting:`ITEM_PIPELINES` setting::
|
||||
|
||||
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
|
||||
|
||||
And set the :setting:`IMAGES_STORE` setting to a valid directory that will be
|
||||
used for storing the downloaded images. Otherwise the pipeline will remain
|
||||
disabled, even if you include it in the :setting:`ITEM_PIPELINES` setting.
|
||||
|
||||
For example::
|
||||
|
||||
IMAGES_STORE = '/path/to/valid/dir'
|
||||
|
||||
Images Storage
|
||||
==============
|
||||
|
||||
File system is currently the only officially supported storage, but there is
|
||||
also (undocumented) support for `Amazon S3`_.
|
||||
|
||||
.. _Amazon S3: https://s3.amazonaws.com/
|
||||
|
||||
File system storage
|
||||
-------------------
|
||||
|
||||
The images are stored in files (one per image), using a `SHA1 hash`_ of their
|
||||
URLs for the file names.
|
||||
|
||||
For example, the following image URL::
|
||||
|
||||
http://www.example.com/image.jpg
|
||||
|
||||
Whose `SHA1 hash`_ is::
|
||||
|
||||
3afec3b4765f8f0a07b78f98c07b83f013567a0a
|
||||
|
||||
Will be downloaded and stored in the following file::
|
||||
|
||||
<IMAGES_STORE>/full/3afec3b4765f8f0a07b78f98c07b83f013567a0a.jpg
|
||||
|
||||
Where:
|
||||
|
||||
* ``<IMAGES_STORE>`` is the directory defined in :setting:`IMAGES_STORE` setting
|
||||
|
||||
* ``full`` is a sub-directory to separate full images from thumbnails (if
|
||||
used). For more info see :ref:`topics-images-thumbnails`.
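The naming scheme can be reproduced with a few lines of Python; this is only
an illustration of the convention, not the pipeline's actual code::

    import hashlib

    def image_file_path(images_store, url):
        # file names are the SHA1 hash of the image URL
        image_id = hashlib.sha1(url.encode('utf8')).hexdigest()
        return '%s/full/%s.jpg' % (images_store, image_id)

    # image_file_path('/path/to/valid/dir', 'http://www.example.com/image.jpg')
    # -> '/path/to/valid/dir/full/3afec3b4765f8f0a07b78f98c07b83f013567a0a.jpg'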
|
||||
|
||||
Additional features
|
||||
===================
|
||||
|
||||
Image expiration
|
||||
----------------
|
||||
|
||||
.. setting:: IMAGES_EXPIRES
|
||||
|
||||
The Image Pipeline avoids downloading images that were downloaded recently. To
|
||||
adjust this retention delay use the :setting:`IMAGES_EXPIRES` setting, which
|
||||
specifies the delay in number of days::
|
||||
|
||||
# 90 days of delay for image expiration
|
||||
IMAGES_EXPIRES = 90
|
||||
|
||||
.. _topics-images-thumbnails:
|
||||
|
||||
Thumbnail generation
|
||||
--------------------
|
||||
|
||||
The Images Pipeline can automatically create thumbnails of the downloaded
|
||||
images.
|
||||
|
||||
.. setting:: IMAGES_THUMBS
|
||||
|
||||
In order to use this feature, you must set :setting:`IMAGES_THUMBS` to a dictionary
|
||||
where the keys are the thumbnail names and the values are their dimensions.
|
||||
|
||||
For example::
|
||||
|
||||
IMAGES_THUMBS = {
|
||||
'small': (50, 50),
|
||||
'big': (270, 270),
|
||||
}
|
||||
|
||||
When you use this feature, the Images Pipeline will create thumbnails of
|
||||
each specified size with this format::
|
||||
|
||||
<IMAGES_STORE>/thumbs/<size_name>/<image_id>.jpg
|
||||
|
||||
Where:
|
||||
|
||||
* ``<size_name>`` is the one specified in the :setting:`IMAGES_THUMBS`
|
||||
dictionary keys (``small``, ``big``, etc)
|
||||
|
||||
* ``<image_id>`` is the `SHA1 hash`_ of the image url
|
||||
|
||||
.. _SHA1 hash: http://en.wikipedia.org/wiki/SHA_hash_functions
|
||||
|
||||
Example of image files stored using ``small`` and ``big`` thumbnail names::
|
||||
|
||||
<IMAGES_STORE>/full/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
<IMAGES_STORE>/thumbs/small/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
<IMAGES_STORE>/thumbs/big/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
|
||||
The first one is the full image, as downloaded from the site.
|
||||
|
||||
Filtering out small images
|
||||
--------------------------
|
||||
|
||||
.. setting:: IMAGES_MIN_HEIGHT
|
||||
|
||||
.. setting:: IMAGES_MIN_WIDTH
|
||||
|
||||
You can drop images which are too small, by specifying the minimum allowed size
|
||||
in the :setting:`IMAGES_MIN_HEIGHT` and :setting:`IMAGES_MIN_WIDTH` settings.
|
||||
|
||||
For example::
|
||||
|
||||
IMAGES_MIN_HEIGHT = 110
|
||||
IMAGES_MIN_WIDTH = 110
|
||||
|
||||
Note: these size constraints don't affect thumbnail generation at all.
|
||||
|
||||
By default, there are no size constraints, so all images are processed.
|
||||
|
||||
.. _topics-images-override:
|
||||
|
||||
Implementing your custom Images Pipeline
|
||||
========================================
|
||||
|
||||
.. module:: scrapy.contrib.pipeline.images
|
||||
:synopsis: Images Pipeline
|
||||
|
||||
Here are the methods that you should override in your custom Images Pipeline:
|
||||
|
||||
.. class:: ImagesPipeline
|
||||
|
||||
.. method:: get_media_requests(item, info)
|
||||
|
||||
As seen on the workflow, the pipeline will get the URLs of the images to
|
||||
download from the item. In order to do this, you must override the
|
||||
:meth:`~get_media_requests` method and return a Request for each
|
||||
image URL::
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
for image_url in item['image_urls']:
|
||||
yield scrapy.Request(image_url)
|
||||
|
||||
Those requests will be processed by the pipeline and, when they have finished
|
||||
downloading, the results will be sent to the
|
||||
:meth:`~item_completed` method, as a list of 2-element tuples.
|
||||
Each tuple will contain ``(success, image_info_or_failure)`` where:
|
||||
|
||||
* ``success`` is a boolean which is ``True`` if the image was downloaded
|
||||
successfully or ``False`` if it failed for some reason
|
||||
|
||||
* ``image_info_or_failure`` is a dict containing the following keys (if success
|
||||
is ``True``) or a `Twisted Failure`_ if there was a problem.
|
||||
|
||||
* ``url`` - the url where the image was downloaded from. This is the url of
|
||||
the request returned from the :meth:`~get_media_requests`
|
||||
method.
|
||||
|
||||
* ``path`` - the path (relative to :setting:`IMAGES_STORE`) where the image
|
||||
was stored
|
||||
|
||||
* ``checksum`` - a `MD5 hash`_ of the image contents
|
||||
|
||||
The list of tuples received by :meth:`~item_completed` is
|
||||
guaranteed to retain the same order of the requests returned from the
|
||||
:meth:`~get_media_requests` method.
|
||||
|
||||
Here's a typical value of the ``results`` argument::
|
||||
|
||||
[(True,
|
||||
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
|
||||
'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
|
||||
'url': 'http://www.example.com/images/product1.jpg'}),
|
||||
(True,
|
||||
{'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',
|
||||
'path': 'full/1ca5879492b8fd606df1964ea3c1e2f4520f076f.jpg',
|
||||
'url': 'http://www.example.com/images/product2.jpg'}),
|
||||
(False,
|
||||
Failure(...))]
|
||||
|
||||
By default the :meth:`get_media_requests` method returns ``None`` which
|
||||
means there are no images to download for the item.
|
||||
|
||||
.. method:: item_completed(results, item, info)
|
||||
|
||||
The :meth:`ImagesPipeline.item_completed` method is called when all image
|
||||
requests for a single item have completed (either finished downloading, or
|
||||
failed for some reason).
|
||||
|
||||
The :meth:`~item_completed` method must return the
|
||||
output that will be sent to subsequent item pipeline stages, so you must
|
||||
return (or drop) the item, as you would in any pipeline.
|
||||
|
||||
Here is an example of the :meth:`~item_completed` method where we
|
||||
store the downloaded image paths (passed in results) in the ``image_paths``
|
||||
item field, and we drop the item if it doesn't contain any images::
|
||||
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
image_paths = [x['path'] for ok, x in results if ok]
|
||||
if not image_paths:
|
||||
raise DropItem("Item contains no images")
|
||||
item['image_paths'] = image_paths
|
||||
return item
|
||||
|
||||
By default, the :meth:`item_completed` method returns the item.
|
||||
|
||||
|
||||
Custom Images pipeline example
|
||||
==============================
|
||||
|
||||
Here is a full example of the Images Pipeline whose methods are exemplified
|
||||
above::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.pipeline.images import ImagesPipeline
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
for image_url in item['image_urls']:
|
||||
yield scrapy.Request(image_url)
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
image_paths = [x['path'] for ok, x in results if ok]
|
||||
if not image_paths:
|
||||
raise DropItem("Item contains no images")
|
||||
item['image_paths'] = image_paths
|
||||
return item
|
||||
|
||||
.. _Twisted Failure: http://twistedmatrix.com/documents/current/api/twisted.python.failure.Failure.html
|
||||
.. _MD5 hash: http://en.wikipedia.org/wiki/MD5
|
@ -5,14 +5,14 @@ Item Pipeline
|
||||
=============
|
||||
|
||||
After an item has been scraped by a spider, it is sent to the Item Pipeline
|
||||
which process it through several components that are executed sequentially.
|
||||
which processes it through several components that are executed sequentially.
|
||||
|
||||
Each item pipeline component (sometimes referred as just "Item Pipeline") is a
|
||||
Python class that implements a simple method. They receive an Item and perform
|
||||
an action over it, also deciding if the Item should continue through the
|
||||
Python class that implements a simple method. They receive an item and perform
|
||||
an action over it, also deciding if the item should continue through the
|
||||
pipeline or be dropped and no longer processed.
|
||||
|
||||
Typical use for item pipelines are:
|
||||
Typical uses of item pipelines are:
|
||||
|
||||
* cleansing HTML data
|
||||
* validating scraped data (checking that the items contain certain fields)
|
||||
@ -23,37 +23,53 @@ Typical use for item pipelines are:
|
||||
Writing your own item pipeline
|
||||
==============================
|
||||
|
||||
Writing your own item pipeline is easy. Each item pipeline component is a
|
||||
single Python class that must implement the following method:
|
||||
Each item pipeline component is a Python class that must implement the following method:
|
||||
|
||||
.. method:: process_item(item, spider)
|
||||
.. method:: process_item(self, item, spider)
|
||||
|
||||
This method is called for every item pipeline component and must either return
|
||||
a :class:`~scrapy.item.Item` (or any descendant class) object or raise a
|
||||
:exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
|
||||
processed by further pipeline components.
|
||||
This method is called for every item pipeline component.
|
||||
|
||||
:param item: the item scraped
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
`item` is an :ref:`item object <item-types>`, see
|
||||
:ref:`supporting-item-types`.
|
||||
|
||||
:meth:`process_item` must either: return an :ref:`item object <item-types>`,
|
||||
return a :class:`~twisted.internet.defer.Deferred` or raise a
|
||||
:exc:`~scrapy.exceptions.DropItem` exception.
|
||||
|
||||
Dropped items are no longer processed by further pipeline components.
|
||||
|
||||
:param item: the scraped item
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
Additionally, they may also implement the following methods:
|
||||
|
||||
.. method:: open_spider(spider)
|
||||
.. method:: open_spider(self, spider)
|
||||
|
||||
This method is called when the spider is opened.
|
||||
|
||||
:param spider: the spider which was opened
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. method:: close_spider(spider)
|
||||
.. method:: close_spider(self, spider)
|
||||
|
||||
This method is called when the spider is closed.
|
||||
|
||||
:param spider: the spider which was closed
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. method:: from_crawler(cls, crawler)
|
||||
|
||||
If present, this classmethod is called to create a pipeline instance
|
||||
from a :class:`~scrapy.crawler.Crawler`. It must return a new instance
|
||||
of the pipeline. The Crawler object provides access to all Scrapy core
|
||||
components like settings and signals; it is a way for the pipeline to
|
||||
access them and hook its functionality into Scrapy.
|
||||
|
||||
:param crawler: crawler that uses this pipeline
|
||||
:type crawler: :class:`~scrapy.crawler.Crawler` object
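Putting these methods together, a bare pipeline skeleton could look like this
(keeping ``crawler.stats`` is only an example of a core component a pipeline
might want to hold on to)::

    class MyPipeline:

        def __init__(self, stats):
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            # give the pipeline access to crawler-level components
            return cls(crawler.stats)

        def open_spider(self, spider):
            # acquire resources here (open files, connect to databases, ...)
            pass

        def close_spider(self, spider):
            # release whatever open_spider acquired
            pass

        def process_item(self, item, spider):
            # return the item so later pipeline components keep processing it
            return item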
|
||||
|
||||
|
||||
Item pipeline example
|
||||
@ -62,20 +78,22 @@ Item pipeline example
|
||||
Price validation and dropping items with no prices
|
||||
--------------------------------------------------
|
||||
|
||||
Let's take a look at the following hypothetical pipeline that adjusts the ``price``
|
||||
attribute for those items that do not include VAT (``price_excludes_vat``
|
||||
attribute), and drops those items which don't contain a price::
|
||||
Let's take a look at the following hypothetical pipeline that adjusts the
|
||||
``price`` attribute for those items that do not include VAT
|
||||
(``price_excludes_vat`` attribute), and drops those items which don't
|
||||
contain a price::
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
class PricePipeline(object):
|
||||
class PricePipeline:
|
||||
|
||||
vat_factor = 1.15
|
||||
|
||||
def process_item(self, item, spider):
|
||||
if item['price']:
|
||||
if item['price_excludes_vat']:
|
||||
item['price'] = item['price'] * self.vat_factor
|
||||
adapter = ItemAdapter(item)
|
||||
if adapter.get('price'):
|
||||
if adapter.get('price_excludes_vat'):
|
||||
adapter['price'] = adapter['price'] * self.vat_factor
|
||||
return item
|
||||
else:
|
||||
raise DropItem("Missing price in %s" % item)
|
||||
@ -84,19 +102,24 @@ attribute), and drops those items which don't contain a price::
|
||||
Write items to a JSON file
|
||||
--------------------------
|
||||
|
||||
The following pipeline stores all scraped items (from all spiders) into a a
|
||||
The following pipeline stores all scraped items (from all spiders) into a
|
||||
single ``items.jl`` file, containing one item per line serialized in JSON
|
||||
format::
|
||||
|
||||
import json
|
||||
|
||||
class JsonWriterPipeline(object):
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
def __init__(self):
|
||||
self.file = open('items.jl', 'wb')
|
||||
class JsonWriterPipeline:
|
||||
|
||||
def open_spider(self, spider):
|
||||
self.file = open('items.jl', 'w')
|
||||
|
||||
def close_spider(self, spider):
|
||||
self.file.close()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
line = json.dumps(dict(item)) + "\n"
|
||||
line = json.dumps(ItemAdapter(item).asdict()) + "\n"
|
||||
self.file.write(line)
|
||||
return item
|
||||
|
||||
@ -104,26 +127,122 @@ format::
|
||||
item pipelines. If you really want to store all scraped items into a JSON
|
||||
file you should use the :ref:`Feed exports <topics-feed-exports>`.
|
||||
|
||||
Write items to MongoDB
|
||||
----------------------
|
||||
|
||||
In this example we'll write items to MongoDB_ using pymongo_.
|
||||
MongoDB address and database name are specified in Scrapy settings;
|
||||
MongoDB collection is named after item class.
|
||||
|
||||
The main point of this example is to show how to use the :meth:`from_crawler`
|
||||
method and how to clean up the resources properly::
|
||||
|
||||
import pymongo
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
class MongoPipeline:
|
||||
|
||||
collection_name = 'scrapy_items'
|
||||
|
||||
def __init__(self, mongo_uri, mongo_db):
|
||||
self.mongo_uri = mongo_uri
|
||||
self.mongo_db = mongo_db
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(
|
||||
mongo_uri=crawler.settings.get('MONGO_URI'),
|
||||
mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
|
||||
)
|
||||
|
||||
def open_spider(self, spider):
|
||||
self.client = pymongo.MongoClient(self.mongo_uri)
|
||||
self.db = self.client[self.mongo_db]
|
||||
|
||||
def close_spider(self, spider):
|
||||
self.client.close()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
|
||||
return item
|
||||
|
||||
.. _MongoDB: https://www.mongodb.com/
|
||||
.. _pymongo: https://api.mongodb.com/python/current/
|
||||
|
||||
|
||||
.. _ScreenshotPipeline:
|
||||
|
||||
Take screenshot of item
|
||||
-----------------------
|
||||
|
||||
This example demonstrates how to use :doc:`coroutine syntax <coroutines>` in
|
||||
the :meth:`process_item` method.
|
||||
|
||||
This item pipeline makes a request to a locally-running instance of Splash_ to
|
||||
render a screenshot of the item URL. After the request response is downloaded,
|
||||
the item pipeline saves the screenshot to a file and adds the filename to the
|
||||
item.
|
||||
|
||||
::
|
||||
|
||||
import hashlib
|
||||
from urllib.parse import quote
|
||||
|
||||
import scrapy
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
class ScreenshotPipeline:
|
||||
"""Pipeline that uses Splash to render screenshot of
|
||||
every Scrapy item."""
|
||||
|
||||
SPLASH_URL = "http://localhost:8050/render.png?url={}"
|
||||
|
||||
async def process_item(self, item, spider):
|
||||
adapter = ItemAdapter(item)
|
||||
encoded_item_url = quote(adapter["url"])
|
||||
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
|
||||
request = scrapy.Request(screenshot_url)
|
||||
response = await spider.crawler.engine.download(request, spider)
|
||||
|
||||
if response.status != 200:
|
||||
# Error happened, return item.
|
||||
return item
|
||||
|
||||
# Save screenshot to file, filename will be hash of url.
|
||||
url = adapter["url"]
|
||||
url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
|
||||
filename = "{}.png".format(url_hash)
|
||||
with open(filename, "wb") as f:
|
||||
f.write(response.body)
|
||||
|
||||
# Store filename in item.
|
||||
adapter["screenshot_filename"] = filename
|
||||
return item
|
||||
|
||||
.. _Splash: https://splash.readthedocs.io/en/stable/
|
||||
|
||||
Duplicates filter
|
||||
-----------------
|
||||
|
||||
A filter that looks for duplicate items, and drops those items that were
|
||||
already processed. Let say that our items have an unique id, but our spider
|
||||
already processed. Let's say that our items have a unique id, but our spider
|
||||
returns multiple items with the same id::
|
||||
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
class DuplicatesPipeline(object):
|
||||
class DuplicatesPipeline:
|
||||
|
||||
def __init__(self):
|
||||
self.ids_seen = set()
|
||||
|
||||
def process_item(self, item, spider):
|
||||
if item['id'] in self.ids_seen:
|
||||
raise DropItem("Duplicate item found: %s" % item)
|
||||
adapter = ItemAdapter(item)
|
||||
if adapter['id'] in self.ids_seen:
|
||||
raise DropItem("Duplicate item found: %r" % item)
|
||||
else:
|
||||
self.ids_seen.add(item['id'])
|
||||
self.ids_seen.add(adapter['id'])
|
||||
return item
|
||||
|
||||
|
||||
@ -139,6 +258,5 @@ To activate an Item Pipeline component you must add its class to the
|
||||
}
|
||||
|
||||
The integer values you assign to classes in this setting determine the
|
||||
order they run in- items go through pipelines from order number low to
|
||||
high. It's customary to define these numbers in the 0-1000 range.
|
||||
|
||||
order in which they run: items go through from lower valued to higher
|
||||
valued classes. It's customary to define these numbers in the 0-1000 range.
|
||||
|
@ -8,22 +8,155 @@ Items
|
||||
:synopsis: Item and Field classes
|
||||
|
||||
The main goal in scraping is to extract structured data from unstructured
|
||||
sources, typically, web pages. Scrapy provides the :class:`Item` class for this
|
||||
purpose.
|
||||
sources, typically, web pages. :ref:`Spiders <topics-spiders>` may return the
|
||||
extracted data as `items`, Python objects that define key-value pairs.
|
||||
|
||||
:class:`Item` objects are simple containers used to collect the scraped data.
|
||||
They provide a `dictionary-like`_ API with a convenient syntax for declaring
|
||||
their available fields.
|
||||
Scrapy supports :ref:`multiple types of items <item-types>`. When you create an
|
||||
item, you may use whichever type of item you want. When you write code that
|
||||
receives an item, your code should :ref:`work for any item type
|
||||
<supporting-item-types>`.
|
||||
|
||||
.. _dictionary-like: http://docs.python.org/library/stdtypes.html#dict
|
||||
.. _item-types:
|
||||
|
||||
Item Types
|
||||
==========
|
||||
|
||||
Scrapy supports the following types of items, via the `itemadapter`_ library:
|
||||
:ref:`dictionaries <dict-items>`, :ref:`Item objects <item-objects>`,
|
||||
:ref:`dataclass objects <dataclass-items>`, and :ref:`attrs objects <attrs-items>`.
|
||||
|
||||
.. _itemadapter: https://github.com/scrapy/itemadapter
|
||||
|
||||
.. _dict-items:
|
||||
|
||||
Dictionaries
|
||||
------------
|
||||
|
||||
As an item type, :class:`dict` is convenient and familiar.
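For example, a spider callback can simply yield plain dictionaries (the
selectors below are only illustrative)::

    def parse(self, response):
        yield {
            'name': response.css('h1::text').get(),
            'url': response.url,
        }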
|
||||
|
||||
.. _item-objects:
|
||||
|
||||
Item objects
|
||||
------------
|
||||
|
||||
:class:`Item` provides a :class:`dict`-like API plus additional features that
|
||||
make it the most feature-complete item type:
|
||||
|
||||
.. class:: Item([arg])
|
||||
|
||||
:class:`Item` objects replicate the standard :class:`dict` API, including
|
||||
its ``__init__`` method.
|
||||
|
||||
:class:`Item` allows defining field names, so that:
|
||||
|
||||
- :class:`KeyError` is raised when using undefined field names (i.e.
|
||||
prevents typos going unnoticed)
|
||||
|
||||
- :ref:`Item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all
|
||||
of them
|
||||
|
||||
:class:`Item` also allows defining field metadata, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
:mod:`trackref` tracks :class:`Item` objects to help find memory leaks
|
||||
(see :ref:`topics-leaks-trackrefs`).
|
||||
|
||||
:class:`Item` objects also provide the following additional API members:
|
||||
|
||||
.. automethod:: copy
|
||||
|
||||
.. automethod:: deepcopy
|
||||
|
||||
.. attribute:: fields
|
||||
|
||||
A dictionary containing *all declared fields* for this Item, not only
|
||||
those populated. The keys are the field names and the values are the
|
||||
:class:`Field` objects used in the :ref:`Item declaration
|
||||
<topics-items-declaring>`.
|
||||
|
||||
Example::
|
||||
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class CustomItem(Item):
|
||||
one_field = Field()
|
||||
another_field = Field()
|
||||
|
||||
.. _dataclass-items:
|
||||
|
||||
Dataclass objects
|
||||
-----------------
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
:func:`~dataclasses.dataclass` allows defining item classes with field names,
|
||||
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all of them.
|
||||
|
||||
Additionally, ``dataclass`` items also allow you to:
|
||||
|
||||
* define the type and default value of each defined field.
|
||||
|
||||
* define custom field metadata through :func:`dataclasses.field`, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
They work natively in Python 3.7 or later, or using the `dataclasses
|
||||
backport`_ in Python 3.6.
|
||||
|
||||
.. _dataclasses backport: https://pypi.org/project/dataclasses/
|
||||
|
||||
Example::
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class CustomItem:
|
||||
one_field: str
|
||||
another_field: int
|
||||
|
||||
.. note:: Field types are not enforced at run time.
|
||||
|
||||
.. _attrs-items:
|
||||
|
||||
attr.s objects
|
||||
--------------
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
:func:`attr.s` allows defining item classes with field names,
|
||||
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all of them.
|
||||
|
||||
Additionally, ``attr.s`` items also allow you to:
|
||||
|
||||
* define the type and default value of each defined field.
|
||||
|
||||
* define custom field :ref:`metadata <attrs:metadata>`, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
In order to use this type, the :doc:`attrs package <attrs:index>` needs to be installed.
|
||||
|
||||
Example::
|
||||
|
||||
import attr
|
||||
|
||||
@attr.s
|
||||
class CustomItem:
|
||||
one_field = attr.ib()
|
||||
another_field = attr.ib()
|
||||
|
||||
|
||||
Working with Item objects
|
||||
=========================
|
||||
|
||||
.. _topics-items-declaring:
|
||||
|
||||
Declaring Items
|
||||
===============
|
||||
Declaring Item subclasses
|
||||
-------------------------
|
||||
|
||||
Items are declared using a simple class definition syntax and :class:`Field`
|
||||
objects. Here is an example::
|
||||
Item subclasses are declared using a simple class definition syntax and
|
||||
:class:`Field` objects. Here is an example::
|
||||
|
||||
import scrapy
|
||||
|
||||
@ -31,19 +164,21 @@ objects. Here is an example::
|
||||
name = scrapy.Field()
|
||||
price = scrapy.Field()
|
||||
stock = scrapy.Field()
|
||||
tags = scrapy.Field()
|
||||
last_updated = scrapy.Field(serializer=str)
|
||||
|
||||
.. note:: Those familiar with `Django`_ will notice that Scrapy Items are
|
||||
declared similar to `Django Models`_, except that Scrapy Items are much
|
||||
simpler as there is no concept of different field types.
|
||||
|
||||
.. _Django: http://www.djangoproject.com/
|
||||
.. _Django Models: http://docs.djangoproject.com/en/dev/topics/db/models/
|
||||
.. _Django: https://www.djangoproject.com/
|
||||
.. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/
|
||||
|
||||
|
||||
.. _topics-items-fields:
|
||||
|
||||
Item Fields
|
||||
===========
|
||||
Declaring fields
|
||||
----------------
|
||||
|
||||
:class:`Field` objects are used to specify metadata for each field. For
|
||||
example, the serializer function for the ``last_updated`` field illustrated in
|
||||
@ -52,7 +187,7 @@ the example above.
|
||||
You can specify any kind of metadata for each field. There is no restriction on
|
||||
the values accepted by :class:`Field` objects. For this same
|
||||
reason, there is no reference list of all available metadata keys. Each key
|
||||
defined in :class:`Field` objects could be used by a different components, and
|
||||
defined in :class:`Field` objects could be used by a different component, and
|
||||
only those components know about it. You can also define and use any other
|
||||
:class:`Field` key in your project too, for your own needs. The main goal of
|
||||
:class:`Field` objects is to provide a way to define all field metadata in one
|
||||
@ -64,120 +199,153 @@ It's important to note that the :class:`Field` objects used to declare the item
|
||||
do not stay assigned as class attributes. Instead, they can be accessed through
|
||||
the :attr:`Item.fields` attribute.
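For illustration, a short sketch using the ``Product`` item declared above::

    Product.fields['last_updated']   # {'serializer': <class 'str'>}
    Product.fields['name']           # {} (no metadata was declared)
    Product.last_updated             # raises AttributeError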
|
||||
|
||||
And that's all you need to know about declaring items.
|
||||
.. class:: Field([arg])
|
||||
|
||||
Working with Items
|
||||
==================
|
||||
The :class:`Field` class is just an alias to the built-in :class:`dict` class and
|
||||
doesn't provide any extra functionality or attributes. In other words,
|
||||
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
||||
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
||||
based on class attributes.
|
||||
|
||||
.. note:: Field metadata can also be declared for ``dataclass`` and ``attrs``
|
||||
items. Please refer to the documentation for `dataclasses.field`_ and
|
||||
`attr.ib`_ for additional information.
|
||||
|
||||
.. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
|
||||
.. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
|
||||
|
||||
|
||||
Working with Item objects
|
||||
-------------------------
|
||||
|
||||
Here are some examples of common tasks performed with items, using the
|
||||
``Product`` item :ref:`declared above <topics-items-declaring>`. You will
|
||||
notice the API is very similar to the `dict API`_.
|
||||
notice the API is very similar to the :class:`dict` API.
|
||||
|
||||
Creating items
|
||||
--------------
|
||||
''''''''''''''
|
||||
|
||||
::
|
||||
>>> product = Product(name='Desktop PC', price=1000)
|
||||
>>> print(product)
|
||||
Product(name='Desktop PC', price=1000)
|
||||
|
||||
>>> product = Product(name='Desktop PC', price=1000)
|
||||
>>> print product
|
||||
Product(name='Desktop PC', price=1000)
|
||||
|
||||
Getting field values
|
||||
--------------------
|
||||
''''''''''''''''''''
|
||||
|
||||
::
|
||||
>>> product['name']
|
||||
Desktop PC
|
||||
>>> product.get('name')
|
||||
Desktop PC
|
||||
|
||||
>>> product['name']
|
||||
Desktop PC
|
||||
>>> product.get('name')
|
||||
Desktop PC
|
||||
>>> product['price']
|
||||
1000
|
||||
|
||||
>>> product['price']
|
||||
1000
|
||||
>>> product['last_updated']
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'last_updated'
|
||||
|
||||
>>> product['last_updated']
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'last_updated'
|
||||
>>> product.get('last_updated', 'not set')
|
||||
not set
|
||||
|
||||
>>> product.get('last_updated', 'not set')
|
||||
not set
|
||||
>>> product['lala'] # getting unknown field
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'lala'
|
||||
|
||||
>>> product['lala'] # getting unknown field
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'lala'
|
||||
>>> product.get('lala', 'unknown field')
|
||||
'unknown field'
|
||||
|
||||
>>> product.get('lala', 'unknown field')
|
||||
'unknown field'
|
||||
>>> 'name' in product # is name field populated?
|
||||
True
|
||||
|
||||
>>> 'name' in product # is name field populated?
|
||||
True
|
||||
>>> 'last_updated' in product # is last_updated populated?
|
||||
False
|
||||
|
||||
>>> 'last_updated' in product # is last_updated populated?
|
||||
False
|
||||
>>> 'last_updated' in product.fields # is last_updated a declared field?
|
||||
True
|
||||
|
||||
>>> 'last_updated' in product.fields # is last_updated a declared field?
|
||||
True
|
||||
>>> 'lala' in product.fields # is lala a declared field?
|
||||
False
|
||||
|
||||
>>> 'lala' in product.fields # is lala a declared field?
|
||||
False
|
||||
|
||||
Setting field values
|
||||
--------------------
|
||||
''''''''''''''''''''
|
||||
|
||||
::
|
||||
>>> product['last_updated'] = 'today'
|
||||
>>> product['last_updated']
|
||||
today
|
||||
|
||||
>>> product['last_updated'] = 'today'
|
||||
>>> product['last_updated']
|
||||
today
|
||||
>>> product['lala'] = 'test' # setting unknown field
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'Product does not support field: lala'
|
||||
|
||||
>>> product['lala'] = 'test' # setting unknown field
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'Product does not support field: lala'
|
||||
|
||||
Accessing all populated values
|
||||
------------------------------
|
||||
''''''''''''''''''''''''''''''
|
||||
|
||||
To access all populated values, just use the typical `dict API`_::
|
||||
To access all populated values, just use the typical :class:`dict` API:
|
||||
|
||||
>>> product.keys()
|
||||
['price', 'name']
|
||||
>>> product.keys()
|
||||
['price', 'name']
|
||||
|
||||
>>> product.items()
|
||||
[('price', 1000), ('name', 'Desktop PC')]
|
||||
|
||||
|
||||
.. _copying-items:
|
||||
|
||||
Copying items
|
||||
'''''''''''''
|
||||
|
||||
To copy an item, you must first decide whether you want a shallow copy or a
|
||||
deep copy.
|
||||
|
||||
If your item contains :term:`mutable` values like lists or dictionaries,
|
||||
a shallow copy will keep references to the same mutable values across all
|
||||
different copies.
|
||||
|
||||
For example, if you have an item with a list of tags, and you create a shallow
|
||||
copy of that item, both the original item and the copy have the same list of
|
||||
tags. Adding a tag to the list of one of the items will add the tag to the
|
||||
other item as well.
|
||||
|
||||
If that is not the desired behavior, use a deep copy instead.
|
||||
|
||||
See :mod:`copy` for more information.
|
||||
|
||||
To create a shallow copy of an item, you can either call
|
||||
:meth:`~scrapy.item.Item.copy` on an existing item
|
||||
(``product2 = product.copy()``) or instantiate your item class from an existing
|
||||
item (``product2 = Product(product)``).
|
||||
|
||||
To create a deep copy, call :meth:`~scrapy.item.Item.deepcopy` instead
|
||||
(``product2 = product.deepcopy()``).
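A quick sketch of the difference, using the ``Product`` item declared earlier
(it has a mutable ``tags`` field)::

    product = Product(name='Desktop PC', tags=['pc'])

    shallow = product.copy()
    shallow['tags'].append('desktop')
    product['tags']   # ['pc', 'desktop'], the list is shared with the copy

    deep = product.deepcopy()
    deep['tags'].append('gaming')
    product['tags']   # still ['pc', 'desktop'], the deep copy is independent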
|
||||
|
||||
>>> product.items()
|
||||
[('price', 1000), ('name', 'Desktop PC')]
|
||||
|
||||
Other common tasks
|
||||
------------------
|
||||
''''''''''''''''''
|
||||
|
||||
Copying items::
|
||||
Creating dicts from items:
|
||||
|
||||
>>> product2 = Product(product)
|
||||
>>> print product2
|
||||
Product(name='Desktop PC', price=1000)
|
||||
>>> dict(product) # create a dict from all populated values
|
||||
{'price': 1000, 'name': 'Desktop PC'}
|
||||
|
||||
>>> product3 = product2.copy()
|
||||
>>> print product3
|
||||
Product(name='Desktop PC', price=1000)
|
||||
Creating items from dicts:
|
||||
|
||||
Creating dicts from items::
|
||||
>>> Product({'name': 'Laptop PC', 'price': 1500})
|
||||
Product(price=1500, name='Laptop PC')
|
||||
|
||||
>>> dict(product) # create a dict from all populated values
|
||||
{'price': 1000, 'name': 'Desktop PC'}
|
||||
>>> Product({'name': 'Laptop PC', 'lala': 1500}) # warning: unknown field in dict
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'Product does not support field: lala'
|
||||
|
||||
Creating items from dicts::
|
||||
|
||||
>>> Product({'name': 'Laptop PC', 'price': 1500})
|
||||
Product(price=1500, name='Laptop PC')
|
||||
|
||||
>>> Product({'name': 'Laptop PC', 'lala': 1500}) # warning: unknown field in dict
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
KeyError: 'Product does not support field: lala'
|
||||
|
||||
Extending Items
|
||||
===============
|
||||
Extending Item subclasses
|
||||
-------------------------
|
||||
|
||||
You can extend Items (to add more fields or to change some metadata for some
|
||||
fields) by declaring a subclass of your original Item.
|
||||
@ -197,36 +365,25 @@ appending more values, or changing existing values, like this::
|
||||
That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
|
||||
keeping all the previously existing metadata values.
|
||||
|
||||
Item objects
|
||||
============
|
||||
|
||||
.. class:: Item([arg])
|
||||
.. _supporting-item-types:
|
||||
|
||||
Return a new Item optionally initialized from the given argument.
|
||||
Supporting All Item Types
|
||||
=========================
|
||||
|
||||
Items replicate the standard `dict API`_, including its constructor. The
|
||||
only additional attribute provided by Items is:
|
||||
In code that receives an item, such as methods of :ref:`item pipelines
|
||||
<topics-item-pipeline>` or :ref:`spider middlewares
|
||||
<topics-spider-middleware>`, it is a good practice to use the
|
||||
:class:`~itemadapter.ItemAdapter` class and the
|
||||
:func:`~itemadapter.is_item` function to write code that works for
|
||||
any :ref:`supported item type <item-types>`:
|
||||
|
||||
.. attribute:: fields
|
||||
.. autoclass:: itemadapter.ItemAdapter
|
||||
|
||||
A dictionary containing *all declared fields* for this Item, not only
|
||||
those populated. The keys are the field names and the values are the
|
||||
:class:`Field` objects used in the :ref:`Item declaration
|
||||
<topics-items-declaring>`.
|
||||
|
||||
.. _dict API: http://docs.python.org/library/stdtypes.html#dict
|
||||
|
||||
Field objects
|
||||
=============
|
||||
|
||||
.. class:: Field([arg])
|
||||
|
||||
The :class:`Field` class is just an alias to the built-in `dict`_ class and
|
||||
doesn't provide any extra functionality or attributes. In other words,
|
||||
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
||||
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
||||
based on class attributes.
|
||||
|
||||
.. _dict: http://docs.python.org/library/stdtypes.html#dict
|
||||
.. autofunction:: itemadapter.is_item
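For example, a component could be written along these lines (the ``name``
field is just an example)::

    from itemadapter import ItemAdapter, is_item

    def normalize_name(obj):
        # works the same for dicts, Item subclasses, dataclass and attrs items
        if not is_item(obj):
            raise TypeError("not an item: %r" % (obj,))
        adapter = ItemAdapter(obj)
        adapter['name'] = (adapter.get('name') or '').strip()
        return obj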
|
||||
|
||||
|
||||
Other classes related to items
|
||||
==============================
|
||||
|
||||
.. autoclass:: ItemMeta
|
||||
|
@ -22,7 +22,7 @@ Job directory
|
||||
|
||||
To enable persistence support you just need to define a *job directory* through
|
||||
the ``JOBDIR`` setting. This directory will be used for storing all required data to
|
||||
keep the state of a single job (ie. a spider run). It's important to note that
|
||||
keep the state of a single job (i.e. a spider run). It's important to note that
|
||||
this directory must not be shared by different spiders, or even different
|
||||
jobs/runs of the same spider, as it's meant to be used for storing the state of
|
||||
a *single* job.
|
||||
@ -30,7 +30,7 @@ a *single* job.
|
||||
How to use it
|
||||
=============
|
||||
|
||||
To start a spider with persistence supported enabled, run it like this::
|
||||
To start a spider with persistence support enabled, run it like this::
|
||||
|
||||
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
|
||||
|
||||
@ -68,32 +68,17 @@ Cookies may expire. So, if you don't resume your spider quickly the requests
|
||||
scheduled may no longer work. This won't be an issue if your spider doesn't rely
|
||||
on cookies.
|
||||
|
||||
|
||||
.. _request-serialization:
|
||||
|
||||
Request serialization
|
||||
---------------------
|
||||
|
||||
Requests must be serializable by the `pickle` module, in order for persistence
|
||||
to work, so you should make sure that your requests are serializable.
|
||||
For persistence to work, :class:`~scrapy.http.Request` objects must be
|
||||
serializable with :mod:`pickle`, except for the ``callback`` and ``errback``
|
||||
values passed to their ``__init__`` method, which must be methods of the
|
||||
running :class:`~scrapy.spiders.Spider` class.
|
||||
|
||||
The most common issue here is to use ``lambda`` functions on request callbacks that
|
||||
can't be persisted.
|
||||
|
||||
So, for example, this won't work::
|
||||
|
||||
def some_callback(self, response):
|
||||
somearg = 'test'
|
||||
return scrapy.Request('http://www.example.com', callback=lambda r: self.other_callback(r, somearg))
|
||||
|
||||
def other_callback(self, response, somearg):
|
||||
print "the argument passed is:", somearg
|
||||
|
||||
But this will::
|
||||
|
||||
def some_callback(self, response):
|
||||
somearg = 'test'
|
||||
return scrapy.Request('http://www.example.com', meta={'somearg': somearg})
|
||||
|
||||
def other_callback(self, response):
|
||||
somearg = response.meta['somearg']
|
||||
print "the argument passed is:", somearg
|
||||
|
||||
.. _pickle: http://docs.python.org/library/pickle.html
|
||||
If you wish to log the requests that couldn't be serialized, you can set the
|
||||
:setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings file.
|
||||
It is ``False`` by default.
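In practice this means passing a spider method as the callback instead of,
for example, a ``lambda``. A minimal sketch (URLs are hypothetical)::

    import scrapy

    class SomeSpider(scrapy.Spider):
        name = 'somespider'
        start_urls = ['http://www.example.com']

        def parse(self, response):
            # the callback is a method of the running spider, and the extra
            # argument travels in cb_kwargs, so the request can be pickled
            yield scrapy.Request(
                'http://www.example.com/other',
                callback=self.other_callback,
                cb_kwargs={'somearg': 'test'},
            )

        def other_callback(self, response, somearg):
            self.logger.info("the argument passed is: %s", somearg)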
|
||||
|
@ -4,7 +4,7 @@
|
||||
Debugging memory leaks
|
||||
======================
|
||||
|
||||
In Scrapy, objects such as Requests, Responses and Items have a finite
|
||||
In Scrapy, objects such as requests, responses and items have a finite
|
||||
lifetime: they are created, used for a while, and finally destroyed.
|
||||
|
||||
From all those objects, the Request is probably the one with the longest
|
||||
@ -17,8 +17,8 @@ what is known as a "memory leak".
|
||||
|
||||
To help debugging memory leaks, Scrapy provides a built-in mechanism for
|
||||
tracking objects references called :ref:`trackref <topics-leaks-trackrefs>`,
|
||||
and you can also use a third-party library called :ref:`Guppy
|
||||
<topics-leaks-guppy>` for more advanced memory debugging (see below for more
|
||||
and you can also use a third-party library called :ref:`muppy
|
||||
<topics-leaks-muppy>` for more advanced memory debugging (see below for more
|
||||
info). Both mechanisms must be used from the :ref:`Telnet Console
|
||||
<topics-telnetconsole>`.
|
||||
|
||||
@ -27,34 +27,42 @@ Common causes of memory leaks
|
||||
|
||||
It happens quite often (sometimes by accident, sometimes on purpose) that the
|
||||
Scrapy developer passes objects referenced in Requests (for example, using the
|
||||
:attr:`~scrapy.http.Request.meta` attribute or the request callback function)
|
||||
and that effectively bounds the lifetime of those referenced objects to the
|
||||
lifetime of the Request. This is, by far, the most common cause of memory leaks
|
||||
in Scrapy projects, and a quite difficult one to debug for newcomers.
|
||||
:attr:`~scrapy.http.Request.cb_kwargs` or :attr:`~scrapy.http.Request.meta`
|
||||
attributes or the request callback function) and that effectively bounds the
|
||||
lifetime of those referenced objects to the lifetime of the Request. This is,
|
||||
by far, the most common cause of memory leaks in Scrapy projects, and a quite
|
||||
difficult one to debug for newcomers.
|
||||
|
||||
In big projects, the spiders are typically written by different people and some
|
||||
of those spiders could be "leaking" and thus affecting the rest of the other
|
||||
(well-written) spiders when they get to run concurrently, which, in turn,
|
||||
affects the whole crawling process.
|
||||
|
||||
At the same time, it's hard to avoid the reasons that cause these leaks
|
||||
without restricting the power of the framework, so we have decided not to
|
||||
restrict the functionally but provide useful tools for debugging these leaks,
|
||||
which quite often consist in an answer to the question: *which spider is leaking?*.
|
||||
affects the whole crawling process.
|
||||
|
||||
The leak could also come from a custom middleware, pipeline or extension that
|
||||
you have written, if you are not releasing the (previously allocated) resources
|
||||
properly. For example, if you're allocating resources on
|
||||
:signal:`spider_opened` but not releasing them on :signal:`spider_closed`.
|
||||
properly. For example, allocating resources on :signal:`spider_opened`
|
||||
but not releasing them on :signal:`spider_closed` may cause problems if
|
||||
you're running :ref:`multiple spiders per process <run-multiple-spiders>`.
|
||||
|
||||
Too Many Requests?
|
||||
------------------
|
||||
|
||||
By default Scrapy keeps the request queue in memory; it includes
|
||||
:class:`~scrapy.http.Request` objects and all objects
|
||||
referenced in Request attributes (e.g. in :attr:`~scrapy.http.Request.cb_kwargs`
|
||||
and :attr:`~scrapy.http.Request.meta`).
|
||||
While not necessarily a leak, this can take a lot of memory. Enabling
|
||||
:ref:`persistent job queue <topics-jobs>` could help keep memory usage
|
||||
under control.
|
||||
|
||||
.. _topics-leaks-trackrefs:
|
||||
|
||||
Debugging memory leaks with ``trackref``
|
||||
========================================
|
||||
|
||||
``trackref`` is a module provided by Scrapy to debug the most common cases of
|
||||
memory leaks. It basically tracks the references to all live Requests,
|
||||
Responses, Item and Selector objects.
|
||||
:mod:`trackref` is a module provided by Scrapy to debug the most common cases of
|
||||
memory leaks. It basically tracks the references to all live Request,
|
||||
Response, Item, Spider and Selector objects.
|
||||
|
||||
You can enter the telnet console and inspect how many objects (of the classes
|
||||
mentioned above) are currently alive using the ``prefs()`` function which is an
|
||||
@ -71,12 +79,10 @@ alias to the :func:`~scrapy.utils.trackref.print_live_refs` function::
|
||||
FormRequest 878 oldest: 7s ago
|
||||
|
||||
As you can see, that report also shows the "age" of the oldest object in each
|
||||
class.
|
||||
|
||||
If you do have leaks, chances are you can figure out which spider is leaking by
|
||||
looking at the oldest request or response. You can get the oldest object of
|
||||
each class using the :func:`~scrapy.utils.trackref.get_oldest` function like
|
||||
this (from the telnet console).
|
||||
class. If you're running multiple spiders per process, chances are you can
|
||||
figure out which spider is leaking by looking at the oldest request or response.
|
||||
You can get the oldest object of each class using the
|
||||
:func:`~scrapy.utils.trackref.get_oldest` function (from the telnet console).
|
||||
|
||||
Which objects are tracked?
|
||||
--------------------------
|
||||
@ -84,28 +90,27 @@ Which objects are tracked?
|
||||
The objects tracked by ``trackref`` are all from these classes (and all their
|
||||
subclasses):
|
||||
|
||||
* ``scrapy.http.Request``
|
||||
* ``scrapy.http.Response``
|
||||
* ``scrapy.item.Item``
|
||||
* ``scrapy.selector.Selector``
|
||||
* ``scrapy.spider.Spider``
|
||||
* :class:`scrapy.http.Request`
|
||||
* :class:`scrapy.http.Response`
|
||||
* :class:`scrapy.item.Item`
|
||||
* :class:`scrapy.selector.Selector`
|
||||
* :class:`scrapy.spiders.Spider`
|
||||
|
||||
A real example
|
||||
--------------
|
||||
|
||||
Let's see a concrete example of an hypothetical case of memory leaks.
|
||||
|
||||
Let's see a concrete example of a hypothetical case of memory leaks.
|
||||
Suppose we have some spider with a line similar to this one::
|
||||
|
||||
return Request("http://www.somenastyspider.com/product.php?pid=%d" % product_id,
|
||||
callback=self.parse, meta={referer: response}")
|
||||
callback=self.parse, cb_kwargs={'referer': response})
|
||||
|
||||
That line is passing a response reference inside a request which effectively
|
||||
ties the response lifetime to the requests' one, and that would definitely
|
||||
cause memory leaks.
|
||||
|
||||
Let's see how we can discover which one is the nasty spider (without knowing it
|
||||
a-priori, of course) by using the ``trackref`` tool.
|
||||
Let's see how we can discover the cause (without knowing it
|
||||
a priori, of course) by using the ``trackref`` tool.
|
||||
|
||||
After the crawler is running for a few minutes and we notice its memory usage
|
||||
has grown a lot, we can enter its telnet console and check the live
|
||||
@ -121,39 +126,39 @@ references::
|
||||
|
||||
The fact that there are so many live responses (and that they're so old) is
|
||||
definitely suspicious, as responses should have a relatively short lifetime
|
||||
compared to Requests. So let's check the oldest response::
|
||||
compared to Requests. The number of responses is similar to the number
|
||||
of requests, so it looks like they are tied in some way. We can now go
|
||||
and check the code of the spider to discover the nasty line that is
|
||||
generating the leaks (passing response references inside requests).
|
||||
|
||||
>>> from scrapy.utils.trackref import get_oldest
|
||||
>>> r = get_oldest('HtmlResponse')
|
||||
>>> r.url
|
||||
'http://www.somenastyspider.com/product.php?pid=123'
|
||||
Sometimes extra information about live objects can be helpful.
|
||||
Let's check the oldest response:
|
||||
|
||||
There it is. By looking at the URL of the oldest response we can see it belongs
|
||||
to the ``somenastyspider.com`` spider. We can now go and check the code of that
|
||||
spider to discover the nasty line that is generating the leaks (passing
|
||||
response references inside requests).
|
||||
>>> from scrapy.utils.trackref import get_oldest
|
||||
>>> r = get_oldest('HtmlResponse')
|
||||
>>> r.url
|
||||
'http://www.somenastyspider.com/product.php?pid=123'
|
||||
|
||||
If you want to iterate over all objects, instead of getting the oldest one, you
|
||||
can use the :func:`iter_all` function::
|
||||
can use the :func:`scrapy.utils.trackref.iter_all` function:
|
||||
|
||||
>>> from scrapy.utils.trackref import iter_all
|
||||
>>> [r.url for r in iter_all('HtmlResponse')]
|
||||
['http://www.somenastyspider.com/product.php?pid=123',
|
||||
'http://www.somenastyspider.com/product.php?pid=584',
|
||||
...
|
||||
>>> from scrapy.utils.trackref import iter_all
|
||||
>>> [r.url for r in iter_all('HtmlResponse')]
|
||||
['http://www.somenastyspider.com/product.php?pid=123',
|
||||
'http://www.somenastyspider.com/product.php?pid=584',
|
||||
...]
|
||||
|
||||
Too many spiders?
|
||||
-----------------
|
||||
|
||||
If your project has too many spiders, the output of ``prefs()`` can be
|
||||
difficult to read. For this reason, that function has a ``ignore`` argument
|
||||
which can be used to ignore a particular class (and all its subclases). For
|
||||
example, using::
|
||||
If your project has too many spiders executed in parallel,
|
||||
the output of :func:`prefs()` can be difficult to read.
|
||||
For this reason, that function has an ``ignore`` argument which can be used to
|
||||
ignore a particular class (and all its subclasses). For
|
||||
example, this won't show any live references to spiders:
|
||||
|
||||
>>> from scrapy.spider import Spider
|
||||
>>> prefs(ignore=Spider)
|
||||
|
||||
Won't show any live references to spiders.
|
||||
>>> from scrapy.spiders import Spider
|
||||
>>> prefs(ignore=Spider)
|
||||
|
||||
.. module:: scrapy.utils.trackref
|
||||
:synopsis: Track references of live objects
|
||||
@ -165,7 +170,7 @@ Here are the functions available in the :mod:`~scrapy.utils.trackref` module.
|
||||
|
||||
.. class:: object_ref
|
||||
|
||||
Inherit from this class (instead of object) if you want to track live
|
||||
Inherit from this class if you want to track live
|
||||
instances with the ``trackref`` module.
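A minimal sketch of tracking instances of your own class::

    from scrapy.utils.trackref import object_ref, get_oldest

    class CustomParser(object_ref):
        """Instances are now counted by prefs() / print_live_refs()."""

    parsers = [CustomParser() for _ in range(3)]
    oldest = get_oldest('CustomParser')  # the first instance created above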
|
||||
|
||||
.. function:: print_live_refs(class_name, ignore=NoneType)
|
||||
@ -174,7 +179,7 @@ Here are the functions available in the :mod:`~scrapy.utils.trackref` module.
|
||||
|
||||
:param ignore: if given, all objects from the specified class (or tuple of
|
||||
classes) will be ignored.
|
||||
:type ignore: class or classes tuple
|
||||
:type ignore: type or tuple
|
||||
|
||||
.. function:: get_oldest(class_name)
|
||||
|
||||
@ -188,69 +193,57 @@ Here are the functions available in the :mod:`~scrapy.utils.trackref` module.
|
||||
``None`` if none is found. Use :func:`print_live_refs` first to get a list
|
||||
of all tracked live objects per class name.
|
||||
|
||||
.. _topics-leaks-guppy:
|
||||
.. _topics-leaks-muppy:
|
||||
|
||||
Debugging memory leaks with Guppy
|
||||
Debugging memory leaks with muppy
|
||||
=================================
|
||||
|
||||
``trackref`` provides a very convenient mechanism for tracking down memory
|
||||
leaks, but it only keeps track of the objects that are more likely to cause
|
||||
memory leaks (Requests, Responses, Items, and Selectors). However, there are
|
||||
other cases where the memory leaks could come from other (more or less obscure)
|
||||
objects. If this is your case, and you can't find your leaks using ``trackref``,
|
||||
you still have another resource: the `Guppy library`_.
|
||||
memory leaks. However, there are other cases where the memory leaks could come
|
||||
from other (more or less obscure) objects. If this is your case, and you can't
|
||||
find your leaks using ``trackref``, you still have another resource: the muppy
|
||||
library.
|
||||
|
||||
.. _Guppy library: http://pypi.python.org/pypi/guppy
|
||||
You can use muppy from `Pympler`_.
|
||||
|
||||
If you use ``setuptools``, you can install Guppy with the following command::
|
||||
.. _Pympler: https://pypi.org/project/Pympler/
|
||||
|
||||
easy_install guppy
|
||||
If you use ``pip``, you can install muppy with the following command::
|
||||
|
||||
.. _setuptools: http://pypi.python.org/pypi/setuptools
|
||||
pip install Pympler
|
||||
|
||||
The telnet console also comes with a built-in shortcut (``hpy``) for accessing
|
||||
Guppy heap objects. Here's an example to view all Python objects available in
|
||||
the heap using Guppy::
|
||||
Here's an example to view all Python objects available in
|
||||
the heap using muppy:
|
||||
|
||||
>>> x = hpy.heap()
|
||||
>>> x.bytype
|
||||
Partition of a set of 297033 objects. Total size = 52587824 bytes.
|
||||
Index Count % Size % Cumulative % Type
|
||||
0 22307 8 16423880 31 16423880 31 dict
|
||||
1 122285 41 12441544 24 28865424 55 str
|
||||
2 68346 23 5966696 11 34832120 66 tuple
|
||||
3 227 0 5836528 11 40668648 77 unicode
|
||||
4 2461 1 2222272 4 42890920 82 type
|
||||
5 16870 6 2024400 4 44915320 85 function
|
||||
6 13949 5 1673880 3 46589200 89 types.CodeType
|
||||
7 13422 5 1653104 3 48242304 92 list
|
||||
8 3735 1 1173680 2 49415984 94 _sre.SRE_Pattern
|
||||
9 1209 0 456936 1 49872920 95 scrapy.http.headers.Headers
|
||||
<1676 more rows. Type e.g. '_.more' to view.>
|
||||
>>> from pympler import muppy
|
||||
>>> all_objects = muppy.get_objects()
|
||||
>>> len(all_objects)
|
||||
28667
|
||||
>>> from pympler import summary
|
||||
>>> suml = summary.summarize(all_objects)
|
||||
>>> summary.print_(suml)
|
||||
types | # objects | total size
|
||||
==================================== | =========== | ============
|
||||
<class 'str | 9822 | 1.10 MB
|
||||
<class 'dict | 1658 | 856.62 KB
|
||||
<class 'type | 436 | 443.60 KB
|
||||
<class 'code | 2974 | 419.56 KB
|
||||
<class '_io.BufferedWriter | 2 | 256.34 KB
|
||||
<class 'set | 420 | 159.88 KB
|
||||
<class '_io.BufferedReader | 1 | 128.17 KB
|
||||
<class 'wrapper_descriptor | 1130 | 88.28 KB
|
||||
<class 'tuple | 1304 | 86.57 KB
|
||||
<class 'weakref | 1013 | 79.14 KB
|
||||
<class 'builtin_function_or_method | 958 | 67.36 KB
|
||||
<class 'method_descriptor | 865 | 60.82 KB
|
||||
<class 'abc.ABCMeta | 62 | 59.96 KB
|
||||
<class 'list | 446 | 58.52 KB
|
||||
<class 'int | 1425 | 43.20 KB
|
||||
|
||||
You can see that most space is used by dicts. Then, if you want to see from
|
||||
which attribute those dicts are referenced, you could do::
|
||||
For more info about muppy, refer to the `muppy documentation`_.
|
||||
|
||||
>>> x.bytype[0].byvia
|
||||
Partition of a set of 22307 objects. Total size = 16423880 bytes.
|
||||
Index Count % Size % Cumulative % Referred Via:
|
||||
0 10982 49 9416336 57 9416336 57 '.__dict__'
|
||||
1 1820 8 2681504 16 12097840 74 '.__dict__', '.func_globals'
|
||||
2 3097 14 1122904 7 13220744 80
|
||||
3 990 4 277200 2 13497944 82 "['cookies']"
|
||||
4 987 4 276360 2 13774304 84 "['cache']"
|
||||
5 985 4 275800 2 14050104 86 "['meta']"
|
||||
6 897 4 251160 2 14301264 87 '[2]'
|
||||
7 1 0 196888 1 14498152 88 "['moduleDict']", "['modules']"
|
||||
8 672 3 188160 1 14686312 89 "['cb_kwargs']"
|
||||
9 27 0 155016 1 14841328 90 '[1]'
|
||||
<333 more rows. Type e.g. '_.more' to view.>
|
||||
|
||||
As you can see, the Guppy module is very powerful but also requires some deep
|
||||
knowledge about Python internals. For more info about Guppy, refer to the
|
||||
`Guppy documentation`_.
|
||||
|
||||
.. _Guppy documentation: http://guppy-pe.sourceforge.net/
|
||||
.. _muppy documentation: https://pythonhosted.org/Pympler/muppy.html
|
||||
|
||||
.. _topics-leaks-without-leaks:
|
||||
|
||||
@ -263,9 +256,9 @@ though neither Scrapy nor your project are leaking memory. This is due to a
|
||||
(not so well) known problem of Python, which may not return released memory to
|
||||
the operating system in some cases. For more information on this issue see:
|
||||
|
||||
* `Python Memory Management <http://evanjones.ca/python-memory.html>`_
|
||||
* `Python Memory Management Part 2 <http://evanjones.ca/python-memory-part2.html>`_
|
||||
* `Python Memory Management Part 3 <http://evanjones.ca/python-memory-part3.html>`_
|
||||
* `Python Memory Management <https://www.evanjones.ca/python-memory.html>`_
|
||||
* `Python Memory Management Part 2 <https://www.evanjones.ca/python-memory-part2.html>`_
|
||||
* `Python Memory Management Part 3 <https://www.evanjones.ca/python-memory-part3.html>`_
|
||||
|
||||
The improvements proposed by Evan Jones, which are detailed in `this paper`_,
|
||||
got merged in Python 2.5, but this only reduces the problem, it doesn't fix it
|
||||
@ -279,7 +272,8 @@ completely. To quote the paper:
|
||||
to move to a compacting garbage collector, which is able to move objects in
|
||||
memory. This would require significant changes to the Python interpreter.*
|
||||
|
||||
This problem will be fixed in future Scrapy releases, where we plan to adopt a
|
||||
new process model and run spiders in a pool of recyclable sub-processes.
|
||||
.. _this paper: https://www.evanjones.ca/memoryallocator/
|
||||
|
||||
.. _this paper: http://evanjones.ca/memoryallocator/
|
||||
To keep memory consumption reasonable you can split the job into several
|
||||
smaller jobs or enable :ref:`persistent job queue <topics-jobs>`
|
||||
and stop/start spider from time to time.
|
||||
|
@ -4,54 +4,41 @@
|
||||
Link Extractors
|
||||
===============
|
||||
|
||||
Link extractors are objects whose only purpose is to extract links from web
|
||||
pages (:class:`scrapy.http.Response` objects) which will be eventually
|
||||
followed.
|
||||
A link extractor is an object that extracts links from responses.
|
||||
|
||||
There is a default link extractor class, ``scrapy.contrib.linkextractors.LinkExtractor``, available
|
||||
in Scrapy, but you can create your own custom Link Extractors to suit your
|
||||
needs by implementing a simple interface.
|
||||
|
||||
The only public method that every link extractor has is ``extract_links``,
|
||||
which receives a :class:`~scrapy.http.Response` object and returns a list
|
||||
of :class:`scrapy.link.Link` objects. Link extractors are meant to be
|
||||
instantiated once and their ``extract_links`` method called several times
|
||||
with different responses to extract links to follow.
|
||||
|
||||
Link extractors are used in the :class:`~scrapy.contrib.spiders.CrawlSpider`
|
||||
class (available in Scrapy), through a set of rules, but you can also use it in
|
||||
your spiders, even if you don't subclass from
|
||||
:class:`~scrapy.contrib.spiders.CrawlSpider`, as its purpose is very simple: to
|
||||
extract links.
|
||||
The ``__init__`` method of
|
||||
:class:`~scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor` takes settings that
|
||||
determine which links may be extracted. :class:`LxmlLinkExtractor.extract_links
|
||||
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` returns a
|
||||
list of matching :class:`scrapy.link.Link` objects from a
|
||||
:class:`~scrapy.http.Response` object.
|
||||
|
||||
Link extractors are used in :class:`~scrapy.spiders.CrawlSpider` spiders
|
||||
through a set of :class:`~scrapy.spiders.Rule` objects. You can also use link
|
||||
extractors in regular spiders.
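For example, in a regular spider you can instantiate a link extractor once and
call its ``extract_links`` method on every response to follow the extracted
links. A minimal sketch (the spider name and start URL are placeholders)::

    from scrapy import Request, Spider
    from scrapy.linkextractors import LinkExtractor

    class FollowAllSpider(Spider):
        name = 'followall'
        start_urls = ['https://example.com']
        link_extractor = LinkExtractor()  # instantiate once, reuse for every response

        def parse(self, response):
            for link in self.link_extractor.extract_links(response):
                # each Link object exposes the absolute URL as link.url
                yield Request(link.url, callback=self.parse)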
|
||||
|
||||
.. _topics-link-extractors-ref:
|
||||
|
||||
Built-in link extractors reference
|
||||
==================================
|
||||
Link extractor reference
|
||||
========================
|
||||
|
||||
.. module:: scrapy.contrib.linkextractors
|
||||
.. module:: scrapy.linkextractors
|
||||
:synopsis: Link extractors classes
|
||||
|
||||
Link extractors classes bundled with Scrapy are provided in the
|
||||
:mod:`scrapy.contrib.linkextractors` module.
|
||||
The link extractor class is
|
||||
:class:`scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor`. For convenience it
|
||||
can also be imported as ``scrapy.linkextractors.LinkExtractor``::
|
||||
|
||||
The default link extractor is ``LinkExtractor``, which is the same as
|
||||
:class:`~.LxmlLinkExtractor`::
|
||||
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
|
||||
There used to be other link extractor classes in previous Scrapy versions,
|
||||
but they are deprecated now.
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
|
||||
LxmlLinkExtractor
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.linkextractors.lxmlhtml
|
||||
.. module:: scrapy.linkextractors.lxmlhtml
|
||||
:synopsis: lxml's HTMLParser-based link extractors
|
||||
|
||||
|
||||
.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
|
||||
.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True, process_value=None, strip=True)
|
||||
|
||||
LxmlLinkExtractor is the recommended link extractor with handy filtering
|
||||
options. It is implemented using lxml's robust HTMLParser.
|
||||
@ -59,13 +46,13 @@ LxmlLinkExtractor
|
||||
:param allow: a single regular expression (or list of regular expressions)
|
||||
that the (absolute) urls must match in order to be extracted. If not
|
||||
given (or empty), it will match all links.
|
||||
:type allow: a regular expression (or list of)
|
||||
:type allow: str or list
|
||||
|
||||
:param deny: a single regular expression (or list of regular expressions)
|
||||
that the (absolute) urls must match in order to be excluded (ie. not
|
||||
that the (absolute) urls must match in order to be excluded (i.e. not
|
||||
extracted). It has precedence over the ``allow`` parameter. If not
|
||||
given (or empty) it won't exclude any links.
|
||||
:type deny: a regular expression (or list of)
|
||||
:type deny: str or list
|
||||
|
||||
:param allow_domains: a single value or a list of strings containing
|
||||
domains which will be considered for extracting the links
|
||||
@ -77,17 +64,32 @@ LxmlLinkExtractor
|
||||
|
||||
:param deny_extensions: a single value or list of strings containing
|
||||
extensions that should be ignored when extracting links.
|
||||
If not given, it will default to the
|
||||
``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
|
||||
module.
|
||||
If not given, it will default to
|
||||
:data:`scrapy.linkextractors.IGNORED_EXTENSIONS`.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
:data:`~scrapy.linkextractors.IGNORED_EXTENSIONS` now includes
|
||||
``7z``, ``7zip``, ``apk``, ``bz2``, ``cdr``, ``dmg``, ``ico``,
|
||||
``iso``, ``tar``, ``tar.gz``, ``webm``, and ``xz``.
|
||||
:type deny_extensions: list
|
||||
|
||||
:param restrict_xpaths: is a XPath (or list of XPath's) which defines
|
||||
:param restrict_xpaths: is an XPath (or list of XPath's) which defines
|
||||
regions inside the response where links should be extracted from.
|
||||
If given, only the text selected by those XPaths will be scanned for
|
||||
links. See examples below.
|
||||
:type restrict_xpaths: str or list
|
||||
|
||||
:param restrict_css: a CSS selector (or list of selectors) which defines
|
||||
regions inside the response where links should be extracted from.
|
||||
Has the same behaviour as ``restrict_xpaths``.
|
||||
:type restrict_css: str or list
|
||||
|
||||
:param restrict_text: a single regular expression (or list of regular expressions)
|
||||
that the link's text must match in order to be extracted. If not
|
||||
given (or empty), it will match all links. If a list of regular expressions is
|
||||
given, the link will be extracted if it matches at least one.
|
||||
:type restrict_text: str or list
|
||||
|
||||
:param tags: a tag or a list of tags to consider when extracting links.
|
||||
Defaults to ``('a', 'area')``.
|
||||
:type tags: str or list
|
||||
@ -98,12 +100,17 @@ LxmlLinkExtractor
|
||||
:type attrs: list
|
||||
|
||||
:param canonicalize: canonicalize each extracted url (using
|
||||
scrapy.utils.url.canonicalize_url). Defaults to ``True``.
|
||||
:type canonicalize: boolean
|
||||
w3lib.url.canonicalize_url). Defaults to ``False``.
|
||||
Note that canonicalize_url is meant for duplicate checking;
|
||||
it can change the URL visible at server side, so the response can be
|
||||
different for requests with canonicalized and raw URLs. If you're
|
||||
using LinkExtractor to follow links it is more robust to
|
||||
keep the default ``canonicalize=False``.
|
||||
:type canonicalize: bool
|
||||
|
||||
:param unique: whether duplicate filtering should be applied to extracted
|
||||
links.
|
||||
:type unique: boolean
|
||||
:type unique: bool
|
||||
|
||||
:param process_value: a function which receives each value extracted from
|
||||
the tag and attributes scanned and can modify the value and return a
|
||||
@ -125,6 +132,17 @@ LxmlLinkExtractor
|
||||
if m:
|
||||
return m.group(1)
|
||||
|
||||
:type process_value: callable
|
||||
:type process_value: collections.abc.Callable
|
||||
|
||||
.. _scrapy.linkextractor: https://github.com/scrapy/scrapy/blob/master/scrapy/linkextractor.py
|
||||
:param strip: whether to strip whitespaces from extracted attributes.
|
||||
According to HTML5 standard, leading and trailing whitespaces
|
||||
must be stripped from ``href`` attributes of ``<a>``, ``<area>``
|
||||
and many other elements, ``src`` attribute of ``<img>``, ``<iframe>``
|
||||
elements, etc., so LinkExtractor strips space chars by default.
|
||||
Set ``strip=False`` to turn it off (e.g. if you're extracting urls
|
||||
from elements or attributes which allow leading/trailing whitespaces).
|
||||
:type strip: bool
|
||||
|
||||
.. automethod:: extract_links
|
||||
|
||||
.. _scrapy.linkextractors: https://github.com/scrapy/scrapy/blob/master/scrapy/linkextractors/__init__.py
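To see how the parameters above combine in practice, here is a sketch that only
extracts category links from a hypothetical listing page (the patterns and the
CSS selector are made up for illustration; ``response`` is assumed to be a
:class:`~scrapy.http.Response`)::

    from scrapy.linkextractors import LinkExtractor

    link_extractor = LinkExtractor(
        allow=r'/category/',         # only absolute URLs matching this pattern
        deny=r'/logout',             # takes precedence over allow
        restrict_css='div.content',  # scan only this region of the page
        unique=True,                 # drop duplicate links
    )
    # returns a list of scrapy.link.Link objects
    links = link_extractor.extract_links(response)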
|
||||
|
@ -4,16 +4,15 @@
|
||||
Item Loaders
|
||||
============
|
||||
|
||||
.. module:: scrapy.contrib.loader
|
||||
.. module:: scrapy.loader
|
||||
:synopsis: Item Loader class
|
||||
|
||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`Items
|
||||
<topics-items>`. Even though Items can be populated using their own
|
||||
dictionary-like API, the Item Loaders provide a much more convenient API for
|
||||
populating them from a scraping process, by automating some common tasks like
|
||||
parsing the raw extracted data before assigning it.
|
||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`items
|
||||
<topics-items>`. Even though items can be populated directly, Item Loaders provide a
|
||||
much more convenient API for populating them from a scraping process, by automating
|
||||
some common tasks like parsing the raw extracted data before assigning it.
|
||||
|
||||
In other words, :ref:`Items <topics-items>` provide the *container* of
|
||||
In other words, :ref:`items <topics-items>` provide the *container* of
|
||||
scraped data, while Item Loaders provide the mechanism for *populating* that
|
||||
container.
|
||||
|
||||
@ -21,25 +20,35 @@ Item Loaders are designed to provide a flexible, efficient and easy mechanism
|
||||
for extending and overriding different field parsing rules, either by spider,
|
||||
or by source format (HTML, XML, etc) without becoming a nightmare to maintain.
|
||||
|
||||
.. note:: Item Loaders are an extension of the itemloaders_ library that make it
|
||||
easier to work with Scrapy by adding support for
|
||||
:ref:`responses <topics-request-response>`.
|
||||
|
||||
Using Item Loaders to populate items
|
||||
====================================
|
||||
|
||||
To use an Item Loader, you must first instantiate it. You can either
|
||||
instantiate it with a dict-like object (e.g. Item or dict) or without one, in
|
||||
which case an Item is automatically instantiated in the Item Loader constructor
|
||||
using the Item class specified in the :attr:`ItemLoader.default_item_class`
|
||||
attribute.
|
||||
instantiate it with an :ref:`item object <topics-items>` or without one, in which
|
||||
case an :ref:`item object <topics-items>` is automatically created in the
|
||||
Item Loader ``__init__`` method using the :ref:`item <topics-items>` class
|
||||
specified in the :attr:`ItemLoader.default_item_class` attribute.
|
||||
|
||||
Then, you start collecting values into the Item Loader, typically using
|
||||
:ref:`Selectors <topics-selectors>`. You can add more than one value to
|
||||
the same item field; the Item Loader will know how to "join" those values later
|
||||
using a proper processing function.
|
||||
|
||||
.. note:: Collected data is internally stored as lists,
|
||||
allowing several values to be added to the same field.
|
||||
If an ``item`` argument is passed when creating a loader,
|
||||
each of the item's values will be stored as-is if it's already
|
||||
an iterable, or wrapped with a list if it's a single value.
|
||||
|
||||
Here is a typical Item Loader usage in a :ref:`Spider <topics-spiders>`, using
|
||||
the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
|
||||
chapter <topics-items>`::
|
||||
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from scrapy.loader import ItemLoader
|
||||
from myproject.items import Product
|
||||
|
||||
def parse(self, response):
|
||||
@ -61,16 +70,41 @@ In other words, data is being collected by extracting it from two XPath
|
||||
locations, using the :meth:`~ItemLoader.add_xpath` method. This is the
|
||||
data that will be assigned to the ``name`` field later.
|
||||
|
||||
Afterwords, similar calls are used for ``price`` and ``stock`` fields
|
||||
(the later using a CSS selector with the :meth:`~ItemLoader.add_css` method),
|
||||
Afterwards, similar calls are used for ``price`` and ``stock`` fields
|
||||
(the latter using a CSS selector with the :meth:`~ItemLoader.add_css` method),
|
||||
and finally the ``last_update`` field is populated directly with a literal value
|
||||
(``today``) using a different method: :meth:`~ItemLoader.add_value`.
|
||||
|
||||
Finally, when all data is collected, the :meth:`ItemLoader.load_item` method is
|
||||
called which actually populates and returns the item populated with the data
|
||||
called which actually returns the item populated with the data
|
||||
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
||||
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
||||
|
||||
|
||||
.. _topics-loaders-dataclass:
|
||||
|
||||
Working with dataclass items
|
||||
============================
|
||||
|
||||
By default, :ref:`dataclass items <dataclass-items>` require all fields to be
|
||||
passed when created. This could be an issue when using dataclass items with
|
||||
item loaders: unless a pre-populated item is passed to the loader, fields
|
||||
will be populated incrementally using the loader's :meth:`~ItemLoader.add_xpath`,
|
||||
:meth:`~ItemLoader.add_css` and :meth:`~ItemLoader.add_value` methods.
|
||||
|
||||
One approach to overcome this is to define items using the
|
||||
:func:`~dataclasses.field` function, with a ``default`` argument::
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
@dataclass
|
||||
class InventoryItem:
|
||||
name: Optional[str] = field(default=None)
|
||||
price: Optional[float] = field(default=None)
|
||||
stock: Optional[int] = field(default=None)
|
||||
|
||||
|
||||
.. _topics-loaders-processors:
|
||||
|
||||
Input and Output processors
|
||||
@ -82,7 +116,7 @@ received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`
|
||||
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
||||
collected and kept inside the ItemLoader. After collecting all data, the
|
||||
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
||||
:class:`~scrapy.item.Item` object. That's when the output processor is
|
||||
:ref:`item object <topics-items>`. That's when the output processor is
|
||||
called with the data previously collected (and processed using the input
|
||||
processor). The result of the output processor is the final value that gets
|
||||
assigned to the item.
|
||||
@ -128,9 +162,12 @@ So what happens is:
|
||||
It's worth noticing that processors are just callable objects, which are called
|
||||
with the data to be parsed, and return a parsed value. So you can use any
|
||||
function as input or output processor. The only requirement is that they must
|
||||
accept one (and only one) positional argument, which will be an iterator.
|
||||
accept one (and only one) positional argument, which will be an iterable.
|
||||
|
||||
.. note:: Both input and output processors must receive an iterator as their
|
||||
.. versionchanged:: 2.0
|
||||
Processors no longer need to be methods.
|
||||
|
||||
.. note:: Both input and output processors must receive an iterable as their
|
||||
first argument. The output of those functions can be anything. The result of
|
||||
input processors will be appended to an internal list (in the Loader)
|
||||
containing the collected values (for that field). The result of the output
|
||||
@ -140,27 +177,26 @@ The other thing you need to keep in mind is that the values returned by input
|
||||
processors are collected internally (in lists) and then passed to output
|
||||
processors to populate the fields.
|
||||
|
||||
Last, but not least, Scrapy comes with some :ref:`commonly used processors
|
||||
<topics-loaders-available-processors>` built-in for convenience.
|
||||
Last, but not least, itemloaders_ comes with some :ref:`commonly used
|
||||
processors <itemloaders:built-in-processors>` built-in for convenience.
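Because any callable that accepts a single iterable will do, a field-specific
processor can be a plain function. A small sketch (the field name and the
cleaning rule are made up for illustration)::

    from itemloaders.processors import TakeFirst
    from scrapy.loader import ItemLoader

    def clean_price(values):
        # processors receive an iterable with the values collected so far
        return [value.replace('$', '').strip() for value in values]

    class ProductLoader(ItemLoader):
        price_in = clean_price   # applied as each value is collected
        price_out = TakeFirst()  # applied when load_item() is called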
|
||||
|
||||
|
||||
Declaring Item Loaders
|
||||
======================
|
||||
|
||||
Item Loaders are declared like Items, by using a class definition syntax. Here
|
||||
is an example::
|
||||
Item Loaders are declared using a class definition syntax. Here is an example::
|
||||
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join
|
||||
from itemloaders.processors import TakeFirst, MapCompose, Join
|
||||
from scrapy.loader import ItemLoader
|
||||
|
||||
class ProductLoader(ItemLoader):
|
||||
|
||||
default_output_processor = TakeFirst()
|
||||
|
||||
name_in = MapCompose(unicode.title)
|
||||
name_in = MapCompose(str.title)
|
||||
name_out = Join()
|
||||
|
||||
price_in = MapCompose(unicode.strip)
|
||||
price_in = MapCompose(str.strip)
|
||||
|
||||
# ...
|
||||
|
||||
@ -182,7 +218,7 @@ output processors to use: in the :ref:`Item Field <topics-items-fields>`
|
||||
metadata. Here is an example::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
|
||||
from itemloaders.processors import Join, MapCompose, TakeFirst
|
||||
from w3lib.html import remove_tags
|
||||
|
||||
def filter_price(value):
|
||||
@ -199,14 +235,12 @@ metadata. Here is an example::
|
||||
output_processor=TakeFirst(),
|
||||
)
|
||||
|
||||
::
|
||||
|
||||
>>> from scrapy.contrib.loader import ItemLoader
|
||||
>>> il = ItemLoader(item=Product())
|
||||
>>> il.add_value('name', [u'Welcome to my', u'<strong>website</strong>'])
|
||||
>>> il.add_value('price', [u'€', u'<span>1000</span>'])
|
||||
>>> il.load_item()
|
||||
{'name': u'Welcome to my website', 'price': u'1000'}
|
||||
>>> from scrapy.loader import ItemLoader
|
||||
>>> il = ItemLoader(item=Product())
|
||||
>>> il.add_value('name', ['Welcome to my', '<strong>website</strong>'])
|
||||
>>> il.add_value('price', ['€', '<span>1000</span>'])
|
||||
>>> il.load_item()
|
||||
{'name': 'Welcome to my website', 'price': '1000'}
|
||||
|
||||
The precedence order, for both input and output processors, is as follows:
|
||||
|
||||
@ -250,7 +284,7 @@ There are several ways to modify Item Loader context values:
|
||||
loader.context['unit'] = 'cm'
|
||||
|
||||
2. On Item Loader instantiation (the keyword arguments of Item Loader
|
||||
constructor are stored in the Item Loader context)::
|
||||
``__init__`` method are stored in the Item Loader context)::
|
||||
|
||||
loader = ItemLoader(product, unit='cm')
|
||||
|
||||
@ -265,230 +299,55 @@ There are several ways to modify Item Loader context values:
|
||||
ItemLoader objects
|
||||
==================
|
||||
|
||||
.. class:: ItemLoader([item, selector, response], \**kwargs)
|
||||
.. autoclass:: scrapy.loader.ItemLoader
|
||||
:members:
|
||||
:inherited-members:
|
||||
|
||||
Return a new Item Loader for populating the given Item. If no item is
|
||||
given, one is instantiated automatically using the class in
|
||||
:attr:`default_item_class`.
|
||||
.. _topics-loaders-nested:
|
||||
|
||||
When instantiated with a `selector` or a `response` parameters
|
||||
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
||||
data from web pages using :ref:`selectors <topics-selectors>`.
|
||||
Nested Loaders
|
||||
==============
|
||||
|
||||
:param item: The item instance to populate using subsequent calls to
|
||||
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
||||
or :meth:`~ItemLoader.add_value`.
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
When parsing related values from a subsection of a document, it can be
|
||||
useful to create nested loaders. Imagine you're extracting details from
|
||||
a footer of a page that looks something like:
|
||||
|
||||
:param selector: The selector to extract data from, when using the
|
||||
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
||||
(resp. :meth:`replace_css`) method.
|
||||
:type selector: :class:`~scrapy.selector.Selector` object
|
||||
Example::
|
||||
|
||||
:param response: The response used to construct the selector using the
|
||||
:attr:`default_selector_class`, unless the selector argument is given,
|
||||
in which case this argument is ignored.
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
<footer>
|
||||
<a class="social" href="https://facebook.com/whatever">Like Us</a>
|
||||
<a class="social" href="https://twitter.com/whatever">Follow Us</a>
|
||||
<a class="email" href="mailto:whatever@example.com">Email Us</a>
|
||||
</footer>
|
||||
|
||||
The item, selector, response and the remaining keyword arguments are
|
||||
assigned to the Loader context (accessible through the :attr:`context` attribute).
|
||||
Without nested loaders, you need to specify the full xpath (or css) for each value
|
||||
that you wish to extract.
|
||||
|
||||
:class:`ItemLoader` instances have the following methods:
|
||||
Example::
|
||||
|
||||
.. method:: get_value(value, \*processors, \**kwargs)
|
||||
loader = ItemLoader(item=Item())
|
||||
# load stuff not in the footer
|
||||
loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
|
||||
loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
|
||||
loader.load_item()
|
||||
|
||||
Process the given ``value`` by the given ``processors`` and keyword
|
||||
arguments.
|
||||
Instead, you can create a nested loader with the footer selector and add values
|
||||
relative to the footer. The functionality is the same but you avoid repeating
|
||||
the footer selector.
|
||||
|
||||
Available keyword arguments:
|
||||
Example::
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
given value using :meth:`~scrapy.utils.misc.extract_regex` method,
|
||||
applied before processors
|
||||
:type re: str or compiled regex
|
||||
loader = ItemLoader(item=Item())
|
||||
# load stuff not in the footer
|
||||
footer_loader = loader.nested_xpath('//footer')
|
||||
footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
|
||||
footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
|
||||
# no need to call footer_loader.load_item()
|
||||
loader.load_item()
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import TakeFirst
|
||||
>>> loader.get_value(u'name: foo', TakeFirst(), unicode.upper, re='name: (.+)')
|
||||
'FOO'
|
||||
|
||||
.. method:: add_value(field_name, value, \*processors, \**kwargs)
|
||||
|
||||
Process and then add the given ``value`` for the given field.
|
||||
|
||||
The value is first passed through :meth:`get_value` by giving the
|
||||
``processors`` and ``kwargs``, and then passed through the
|
||||
:ref:`field input processor <topics-loaders-processors>` and its result
|
||||
appended to the data collected for that field. If the field already
|
||||
contains collected data, the new data is added.
|
||||
|
||||
The given ``field_name`` can be ``None``, in which case values for
|
||||
multiple fields may be added. And the processed value should be a dict
|
||||
with field_name mapped to values.
|
||||
|
||||
Examples::
|
||||
|
||||
loader.add_value('name', u'Color TV')
|
||||
loader.add_value('colours', [u'white', u'blue'])
|
||||
loader.add_value('length', u'100')
|
||||
loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)')
|
||||
loader.add_value(None, {'name': u'foo', 'sex': u'male'})
|
||||
|
||||
.. method:: replace_value(field_name, value, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_value` but replaces the collected data with the
|
||||
new value instead of adding it.
|
||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`ItemLoader`.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
selected XPath region
|
||||
:type re: str or compiled regex
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.get_xpath('//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
selector associated with this :class:`ItemLoader`.
|
||||
|
||||
See :meth:`get_xpath` for ``kwargs``.
|
||||
|
||||
:param xpath: the XPath to extract data from
|
||||
:type xpath: str
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.add_xpath('name', '//p[@class="product-name"]')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. method:: get_css(css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
from the selector associated with this :class:`ItemLoader`.
|
||||
|
||||
:param css: the CSS selector to extract data from
|
||||
:type css: str
|
||||
|
||||
:param re: a regular expression to use for extracting data from the
|
||||
selected CSS region
|
||||
:type re: str or compiled regex
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.get_css('p.product-name')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_css(field_name, css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
from the selector associated with this :class:`ItemLoader`.
|
||||
|
||||
See :meth:`get_css` for ``kwargs``.
|
||||
|
||||
:param css: the CSS selector to extract data from
|
||||
:type css: str
|
||||
|
||||
Examples::
|
||||
|
||||
# HTML snippet: <p class="product-name">Color TV</p>
|
||||
loader.add_css('name', 'p.product-name')
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_css('price', 'p#price', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_css(field_name, css, \*processors, \**kwargs)
|
||||
|
||||
Similar to :meth:`add_css` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. method:: load_item()
|
||||
|
||||
Populate the item with the data collected so far, and return it. The
|
||||
data collected is first passed through the :ref:`output processors
|
||||
<topics-loaders-processors>` to get the final value to assign to each
|
||||
item field.
|
||||
|
||||
.. method:: get_collected_values(field_name)
|
||||
|
||||
Return the collected values for the given field.
|
||||
|
||||
.. method:: get_output_value(field_name)
|
||||
|
||||
Return the collected values parsed using the output processor, for the
|
||||
given field. This method doesn't populate or modify the item at all.
|
||||
|
||||
.. method:: get_input_processor(field_name)
|
||||
|
||||
Return the input processor for the given field.
|
||||
|
||||
.. method:: get_output_processor(field_name)
|
||||
|
||||
Return the output processor for the given field.
|
||||
|
||||
:class:`ItemLoader` instances have the following attributes:
|
||||
|
||||
.. attribute:: item
|
||||
|
||||
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
||||
|
||||
.. attribute:: context
|
||||
|
||||
The currently active :ref:`Context <topics-loaders-context>` of this
|
||||
Item Loader.
|
||||
|
||||
.. attribute:: default_item_class
|
||||
|
||||
An Item class (or factory), used to instantiate items when not given in
|
||||
the constructor.
|
||||
|
||||
.. attribute:: default_input_processor
|
||||
|
||||
The default input processor to use for those fields which don't specify
|
||||
one.
|
||||
|
||||
.. attribute:: default_output_processor
|
||||
|
||||
The default output processor to use for those fields which don't specify
|
||||
one.
|
||||
|
||||
.. attribute:: default_selector_class
|
||||
|
||||
The class used to construct the :attr:`selector` of this
|
||||
:class:`ItemLoader`, if only a response is given in the constructor.
|
||||
If a selector is given in the constructor this attribute is ignored.
|
||||
This attribute is sometimes overridden in subclasses.
|
||||
|
||||
.. attribute:: selector
|
||||
|
||||
The :class:`~scrapy.selector.Selector` object to extract data from.
|
||||
It's either the selector given in the constructor or one created from
|
||||
the response given in the constructor using the
|
||||
:attr:`default_selector_class`. This attribute is meant to be
|
||||
read-only.
|
||||
You can nest loaders arbitrarily and they work with either xpath or css selectors.
|
||||
As a general guideline, use nested loaders when they make your code simpler but do
|
||||
not go overboard with nesting or your parser can become difficult to read.
|
||||
|
||||
.. _topics-loaders-extending:
|
||||
|
||||
@ -513,7 +372,7 @@ those dashes in the final product names.
|
||||
Here's how you can remove those dashes by reusing and extending the default
|
||||
Product Item Loader (``ProductLoader``)::
|
||||
|
||||
from scrapy.contrib.loader.processor import MapCompose
|
||||
from itemloaders.processors import MapCompose
|
||||
from myproject.ItemLoaders import ProductLoader
|
||||
|
||||
def strip_dashes(x):
|
||||
@ -526,7 +385,7 @@ Another case where extending Item Loaders can be very helpful is when you have
|
||||
multiple source formats, for example XML and HTML. In the XML version you may
|
||||
want to remove ``CDATA`` occurrences. Here's an example of how to do it::
|
||||
|
||||
from scrapy.contrib.loader.processor import MapCompose
|
||||
from itemloaders.processors import MapCompose
|
||||
from myproject.ItemLoaders import ProductLoader
|
||||
from myproject.utils.xml import remove_cdata
|
||||
|
||||
@ -546,132 +405,5 @@ projects. Scrapy only provides the mechanism; it doesn't impose any specific
|
||||
organization of your Loaders collection - that's up to you and your project's
|
||||
needs.
|
||||
|
||||
.. _topics-loaders-available-processors:
|
||||
|
||||
Available built-in processors
|
||||
=============================
|
||||
|
||||
.. module:: scrapy.contrib.loader.processor
|
||||
:synopsis: A collection of processors to use with Item Loaders
|
||||
|
||||
Even though you can use any callable function as input and output processors,
|
||||
Scrapy provides some commonly used processors, which are described below. Some
|
||||
of them, like the :class:`MapCompose` (which is typically used as input
|
||||
processor) compose the output of several functions executed in order, to
|
||||
produce the final parsed value.
|
||||
|
||||
Here is a list of all built-in processors:
|
||||
|
||||
.. class:: Identity
|
||||
|
||||
The simplest processor, which doesn't do anything. It returns the original
|
||||
values unchanged. It doesn't receive any constructor arguments, nor does it accept
|
||||
Loader contexts.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Identity
|
||||
>>> proc = Identity()
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
['one', 'two', 'three']
|
||||
|
||||
.. class:: TakeFirst
|
||||
|
||||
Returns the first non-null/non-empty value from the values received,
|
||||
so it's typically used as an output processor to single-valued fields.
|
||||
It doesn't receive any constructor arguments, nor accept Loader contexts.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import TakeFirst
|
||||
>>> proc = TakeFirst()
|
||||
>>> proc(['', 'one', 'two', 'three'])
|
||||
'one'
|
||||
|
||||
.. class:: Join(separator=u' ')
|
||||
|
||||
Returns the values joined with the separator given in the constructor, which
|
||||
defaults to ``u' '``. It doesn't accept Loader contexts.
|
||||
|
||||
When using the default separator, this processor is equivalent to the
|
||||
function: ``u' '.join``
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Join
|
||||
>>> proc = Join()
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
u'one two three'
|
||||
>>> proc = Join('<br>')
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
u'one<br>two<br>three'
|
||||
|
||||
.. class:: Compose(\*functions, \**default_loader_context)
|
||||
|
||||
A processor which is constructed from the composition of the given
|
||||
functions. This means that each input value of this processor is passed to
|
||||
the first function, and the result of that function is passed to the second
|
||||
function, and so on, until the last function returns the output value of
|
||||
this processor.
|
||||
|
||||
By default, processing stops on a ``None`` value. This behaviour can be changed by
|
||||
passing the keyword argument ``stop_on_none=False``.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Compose
|
||||
>>> proc = Compose(lambda v: v[0], str.upper)
|
||||
>>> proc(['hello', 'world'])
|
||||
'HELLO'
|
||||
|
||||
Each function can optionally receive a ``loader_context`` parameter. For
|
||||
those which do, this processor will pass the currently active :ref:`Loader
|
||||
context <topics-loaders-context>` through that parameter.
|
||||
|
||||
The keyword arguments passed in the constructor are used as the default
|
||||
Loader context values passed to each function call. However, the final
|
||||
Loader context values passed to functions are overridden with the currently
|
||||
active Loader context accessible through the :meth:`ItemLoader.context`
|
||||
attribute.
|
||||
|
||||
.. class:: MapCompose(\*functions, \**default_loader_context)
|
||||
|
||||
A processor which is constructed from the composition of the given
|
||||
functions, similar to the :class:`Compose` processor. The difference with
|
||||
this processor is the way internal results are passed among functions,
|
||||
which is as follows:
|
||||
|
||||
The input value of this processor is *iterated* and the first function is
|
||||
applied to each element. The results of these function calls (one for each element)
|
||||
are concatenated to construct a new iterable, which is then used to apply the
|
||||
second function, and so on, until the last function is applied to each
|
||||
value of the list of values collected so far. The output values of the last
|
||||
function are concatenated together to produce the output of this processor.
|
||||
|
||||
Each particular function can return a value or a list of values, which is
|
||||
flattened with the list of values returned by the same function applied to
|
||||
the other input values. The functions can also return ``None`` in which
|
||||
case the output of that function is ignored for further processing over the
|
||||
chain.
|
||||
|
||||
This processor provides a convenient way to compose functions that only
|
||||
work with single values (instead of iterables). For this reason the
|
||||
:class:`MapCompose` processor is typically used as input processor, since
|
||||
data is often extracted using the
|
||||
:meth:`~scrapy.selector.Selector.extract` method of :ref:`selectors
|
||||
<topics-selectors>`, which returns a list of unicode strings.
|
||||
|
||||
The example below should clarify how it works::
|
||||
|
||||
>>> def filter_world(x):
|
||||
... return None if x == 'world' else x
|
||||
...
|
||||
>>> from scrapy.contrib.loader.processor import MapCompose
|
||||
>>> proc = MapCompose(filter_world, unicode.upper)
|
||||
>>> proc([u'hello', u'world', u'this', u'is', u'scrapy'])
|
||||
[u'HELLO', u'THIS', u'IS', u'SCRAPY']
|
||||
|
||||
As with the Compose processor, functions can receive Loader contexts, and
|
||||
constructor keyword arguments are used as default context values. See
|
||||
:class:`Compose` processor for more info.
|
||||
|
||||
.. _itemloaders: https://itemloaders.readthedocs.io/en/latest/
|
||||
.. _processors: https://itemloaders.readthedocs.io/en/latest/built-in-processors.html
|
||||
|
@ -4,119 +4,273 @@
|
||||
Logging
|
||||
=======
|
||||
|
||||
Scrapy provides a logging facility which can be used through the
|
||||
:mod:`scrapy.log` module. The current underlying implementation uses `Twisted
|
||||
logging`_ but this may change in the future.
|
||||
.. note::
|
||||
:mod:`scrapy.log` has been deprecated alongside its functions in favor of
|
||||
explicit calls to the Python standard logging. Keep reading to learn more
|
||||
about the new logging system.
|
||||
|
||||
.. _Twisted logging: http://twistedmatrix.com/projects/core/documentation/howto/logging.html
|
||||
Scrapy uses :mod:`logging` for event logging. We'll
|
||||
provide some simple examples to get you started, but for more advanced
|
||||
use-cases it's strongly suggested to read its documentation thoroughly.
|
||||
|
||||
The logging service must be explicitly started through the :func:`scrapy.log.start` function.
|
||||
Logging works out of the box, and can be configured to some extent with the
|
||||
Scrapy settings listed in :ref:`topics-logging-settings`.
|
||||
|
||||
Scrapy calls :func:`scrapy.utils.log.configure_logging` to set some reasonable
|
||||
defaults and handle those settings in :ref:`topics-logging-settings` when
|
||||
running commands, so it's recommended to manually call it if you're running
|
||||
Scrapy from scripts as described in :ref:`run-from-script`.
|
||||
|
||||
.. _topics-logging-levels:
|
||||
|
||||
Log levels
|
||||
==========
|
||||
|
||||
Scrapy provides 5 logging levels:
|
||||
Python's builtin logging defines 5 different levels to indicate the severity of a
|
||||
given log message. Here are the standard ones, listed in decreasing order:
|
||||
|
||||
1. :data:`~scrapy.log.CRITICAL` - for critical errors
|
||||
2. :data:`~scrapy.log.ERROR` - for regular errors
|
||||
3. :data:`~scrapy.log.WARNING` - for warning messages
|
||||
4. :data:`~scrapy.log.INFO` - for informational messages
|
||||
5. :data:`~scrapy.log.DEBUG` - for debugging messages
|
||||
|
||||
How to set the log level
|
||||
========================
|
||||
|
||||
You can set the log level using the `--loglevel/-L` command line option, or
|
||||
using the :setting:`LOG_LEVEL` setting.
|
||||
1. ``logging.CRITICAL`` - for critical errors (highest severity)
|
||||
2. ``logging.ERROR`` - for regular errors
|
||||
3. ``logging.WARNING`` - for warning messages
|
||||
4. ``logging.INFO`` - for informational messages
|
||||
5. ``logging.DEBUG`` - for debugging messages (lowest severity)
|
||||
|
||||
How to log messages
|
||||
===================
|
||||
|
||||
Here's a quick example of how to log a message using the ``WARNING`` level::
|
||||
Here's a quick example of how to log a message using the ``logging.WARNING``
|
||||
level::
|
||||
|
||||
from scrapy import log
|
||||
log.msg("This is a warning", level=log.WARNING)
|
||||
import logging
|
||||
logging.warning("This is a warning")
|
||||
|
||||
There are shortcuts for issuing log messages on any of the standard 5 levels,
|
||||
and there's also a general ``logging.log`` method which takes a given level as
|
||||
argument. If needed, the last example could be rewritten as::
|
||||
|
||||
import logging
|
||||
logging.log(logging.WARNING, "This is a warning")
|
||||
|
||||
On top of that, you can create different "loggers" to encapsulate messages. (For
|
||||
example, a common practice is to create different loggers for every module).
|
||||
These loggers can be configured independently, and they allow hierarchical
|
||||
constructions.
|
||||
|
||||
The previous examples use the root logger behind the scenes, which is a top level
|
||||
logger to which all messages are propagated (unless otherwise specified). Using
|
||||
``logging`` helpers is merely a shortcut for getting the root logger
|
||||
explicitly, so this is also equivalent to the previous snippets::
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger()
|
||||
logger.warning("This is a warning")
|
||||
|
||||
You can use a different logger just by getting its name with the
|
||||
``logging.getLogger`` function::
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger('mycustomlogger')
|
||||
logger.warning("This is a warning")
|
||||
|
||||
Finally, you can ensure having a custom logger for any module you're working on
|
||||
by using the ``__name__`` variable, which is populated with current module's
|
||||
path::
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning("This is a warning")
|
||||
|
||||
.. seealso::
|
||||
|
||||
Module logging, :doc:`HowTo <howto/logging>`
|
||||
Basic Logging Tutorial
|
||||
|
||||
Module logging, :ref:`Loggers <logger>`
|
||||
Further documentation on loggers
|
||||
|
||||
.. _topics-logging-from-spiders:
|
||||
|
||||
Logging from Spiders
|
||||
====================
|
||||
|
||||
The recommended way to log from spiders is by using the Spider
|
||||
:meth:`~scrapy.spider.Spider.log` method, which already populates the
|
||||
``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments
|
||||
are passed directly to the :func:`~scrapy.log.msg` function.
|
||||
Scrapy provides a :data:`~scrapy.spiders.Spider.logger` within each Spider
|
||||
instance, which can be accessed and used like this::
|
||||
|
||||
scrapy.log module
|
||||
=================
|
||||
import scrapy
|
||||
|
||||
.. module:: scrapy.log
|
||||
:synopsis: Logging facility
|
||||
class MySpider(scrapy.Spider):
|
||||
|
||||
.. function:: start(logfile=None, loglevel=None, logstdout=None)
|
||||
name = 'myspider'
|
||||
start_urls = ['https://scrapinghub.com']
|
||||
|
||||
Start the logging facility. This must be called before actually logging any
|
||||
messages. Otherwise, messages logged before this call will get lost.
|
||||
def parse(self, response):
|
||||
self.logger.info('Parse function called on %s', response.url)
|
||||
|
||||
:param logfile: the file path to use for logging output. If omitted, the
|
||||
:setting:`LOG_FILE` setting will be used. If both are ``None``, the log
|
||||
will be sent to standard error.
|
||||
:type logfile: str
|
||||
That logger is created using the Spider's name, but you can use any custom
|
||||
Python logger you want. For example::
|
||||
|
||||
:param loglevel: the minimum logging level to log. Available values are:
|
||||
:data:`CRITICAL`, :data:`ERROR`, :data:`WARNING`, :data:`INFO` and
|
||||
:data:`DEBUG`.
|
||||
import logging
|
||||
import scrapy
|
||||
|
||||
:param logstdout: if ``True``, all standard output (and error) of your
|
||||
application will be logged instead. For example if you "print 'hello'"
|
||||
it will appear in the Scrapy log. If omitted, the :setting:`LOG_STDOUT`
|
||||
setting will be used.
|
||||
:type logstdout: boolean
|
||||
logger = logging.getLogger('mycustomlogger')
|
||||
|
||||
.. function:: msg(message, level=INFO, spider=None)
|
||||
class MySpider(scrapy.Spider):
|
||||
|
||||
Log a message
|
||||
name = 'myspider'
|
||||
start_urls = ['https://scrapinghub.com']
|
||||
|
||||
:param message: the message to log
|
||||
:type message: str
|
||||
def parse(self, response):
|
||||
logger.info('Parse function called on %s', response.url)
|
||||
|
||||
:param level: the log level for this message. See
|
||||
:ref:`topics-logging-levels`.
|
||||
.. _topics-logging-configuration:
|
||||
|
||||
:param spider: the spider to use for logging this message. This parameter
|
||||
should always be used when logging things related to a particular
|
||||
spider.
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
Logging configuration
|
||||
=====================
|
||||
|
||||
.. data:: CRITICAL
|
||||
Loggers on their own don't manage how messages sent through them are displayed.
|
||||
For this task, different "handlers" can be attached to any logger instance and
|
||||
they will redirect those messages to appropriate destinations, such as the
|
||||
standard output, files, emails, etc.
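As a sketch of what attaching a handler looks like with the standard library
(the file name and format string are arbitrary), you could send everything the
``scrapy`` logger and its children emit to a dedicated file::

    import logging

    handler = logging.FileHandler('scrapy-only.log')
    handler.setFormatter(logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
    # messages from 'scrapy', 'scrapy.core', etc. propagate to this logger
    logging.getLogger('scrapy').addHandler(handler)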
|
||||
|
||||
Log level for critical errors
|
||||
By default, Scrapy sets and configures a handler for the root logger, based on
|
||||
the settings below.
|
||||
|
||||
.. data:: ERROR
|
||||
|
||||
Log level for errors
|
||||
|
||||
.. data:: WARNING
|
||||
|
||||
Log level for warnings
|
||||
|
||||
.. data:: INFO
|
||||
|
||||
Log level for informational messages (recommended level for production
|
||||
deployments)
|
||||
|
||||
.. data:: DEBUG
|
||||
|
||||
Log level for debugging messages (recommended level for development)
|
||||
.. _topics-logging-settings:
|
||||
|
||||
Logging settings
|
||||
================
|
||||
----------------
|
||||
|
||||
These settings can be used to configure the logging:
|
||||
|
||||
* :setting:`LOG_FILE`
|
||||
* :setting:`LOG_ENABLED`
|
||||
* :setting:`LOG_ENCODING`
|
||||
* :setting:`LOG_FILE`
|
||||
* :setting:`LOG_LEVEL`
|
||||
* :setting:`LOG_FORMAT`
|
||||
* :setting:`LOG_DATEFORMAT`
|
||||
* :setting:`LOG_STDOUT`
|
||||
* :setting:`LOG_SHORT_NAMES`
|
||||
|
||||
The first couple of settings define a destination for log messages. If
|
||||
:setting:`LOG_FILE` is set, messages sent through the root logger will be
|
||||
redirected to a file named :setting:`LOG_FILE` with encoding
|
||||
:setting:`LOG_ENCODING`. If unset and :setting:`LOG_ENABLED` is ``True``, log
|
||||
messages will be displayed on the standard error. Lastly, if
|
||||
:setting:`LOG_ENABLED` is ``False``, there won't be any visible log output.
|
||||
|
||||
:setting:`LOG_LEVEL` determines the minimum level of severity to display; those
|
||||
messages with lower severity will be filtered out. It ranges through the
|
||||
possible levels listed in :ref:`topics-logging-levels`.
|
||||
|
||||
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
|
||||
used as layouts for all messages. Those strings can contain any placeholders
|
||||
listed in :ref:`logging's logrecord attributes docs <logrecord-attributes>` and
|
||||
:ref:`datetime's strftime and strptime directives <strftime-strptime-behavior>`
|
||||
respectively.
|
||||
|
||||
If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy
|
||||
component that prints the log. It is unset by default, hence logs contain the
|
||||
Scrapy component responsible for that log output.
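For example, a project's ``settings.py`` could combine these settings as in the
following sketch (every value here is an arbitrary illustration, not a
recommendation)::

    # settings.py
    LOG_ENABLED = True
    LOG_FILE = 'scrapy.log'    # write to a file instead of standard error
    LOG_ENCODING = 'utf-8'
    LOG_LEVEL = 'INFO'         # filter out DEBUG messages
    LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'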
|
||||
|
||||
Command-line options
|
||||
--------------------
|
||||
|
||||
There are command-line arguments, available for all commands, that you can use
|
||||
to override some of the Scrapy settings regarding logging.
|
||||
|
||||
* ``--logfile FILE``
|
||||
Overrides :setting:`LOG_FILE`
|
||||
* ``--loglevel/-L LEVEL``
|
||||
Overrides :setting:`LOG_LEVEL`
|
||||
* ``--nolog``
|
||||
Sets :setting:`LOG_ENABLED` to ``False``
|
||||
|
||||
.. seealso::
|
||||
|
||||
Module :mod:`logging.handlers`
|
||||
Further documentation on available handlers
|
||||
|
||||
.. _custom-log-formats:
|
||||
|
||||
Custom Log Formats
|
||||
------------------
|
||||
|
||||
A custom log format can be set for different actions by extending
|
||||
:class:`~scrapy.logformatter.LogFormatter` class and making
|
||||
:setting:`LOG_FORMATTER` point to your new class.
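For example, a common customization is to lower the level used when an item is
dropped. The following sketch overrides the ``dropped`` method for that purpose
(the message text is arbitrary)::

    import logging

    from scrapy import logformatter

    class PoliteLogFormatter(logformatter.LogFormatter):
        def dropped(self, item, exception, response, spider):
            # log dropped items at DEBUG instead of the default warning level
            return {
                'level': logging.DEBUG,
                'msg': "Dropped: %(exception)s" + "\n" + "%(item)s",
                'args': {
                    'exception': exception,
                    'item': item,
                },
            }

:setting:`LOG_FORMATTER` would then point to the import path of this class,
e.g. ``myproject.logformatters.PoliteLogFormatter`` (a hypothetical module).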
|
||||
|
||||
.. autoclass:: scrapy.logformatter.LogFormatter
|
||||
:members:
|
||||
|
||||
|
||||
.. _topics-logging-advanced-customization:
|
||||
|
||||
Advanced customization
|
||||
----------------------
|
||||
|
||||
Because Scrapy uses stdlib logging module, you can customize logging using
|
||||
all features of stdlib logging.
|
||||
|
||||
For example, let's say you're scraping a website which returns many
|
||||
HTTP 404 and 500 responses, and you want to hide all messages like this::
|
||||
|
||||
2016-12-16 22:00:06 [scrapy.spidermiddlewares.httperror] INFO: Ignoring
|
||||
response <500 http://quotes.toscrape.com/page/1-34/>: HTTP status code
|
||||
is not handled or not allowed
|
||||
|
||||
The first thing to note is a logger name - it is in brackets:
|
||||
``[scrapy.spidermiddlewares.httperror]``. If you get just ``[scrapy]`` then
|
||||
:setting:`LOG_SHORT_NAMES` is likely set to True; set it to False and re-run
|
||||
the crawl.
|
||||
|
||||
Next, we can see that the message has INFO level. To hide it
|
||||
we should set logging level for ``scrapy.spidermiddlewares.httperror``
|
||||
higher than INFO; next level after INFO is WARNING. It could be done
|
||||
e.g. in the spider's ``__init__`` method::
|
||||
|
||||
import logging
|
||||
import scrapy
|
||||
|
||||
|
||||
class MySpider(scrapy.Spider):
|
||||
# ...
|
||||
def __init__(self, *args, **kwargs):
|
||||
logger = logging.getLogger('scrapy.spidermiddlewares.httperror')
|
||||
logger.setLevel(logging.WARNING)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
If you run this spider again then INFO messages from
|
||||
``scrapy.spidermiddlewares.httperror`` logger will be gone.
|
||||
|
||||
scrapy.utils.log module
|
||||
=======================
|
||||
|
||||
.. module:: scrapy.utils.log
|
||||
:synopsis: Logging utils
|
||||
|
||||
.. autofunction:: configure_logging
|
||||
|
||||
``configure_logging`` is automatically called when using Scrapy commands
|
||||
or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
|
||||
when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
|
||||
In that case, its usage is not required but it's recommended.
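A minimal sketch of the :class:`~scrapy.crawler.CrawlerRunner` case, calling
``configure_logging`` before starting the crawl (the spider and its URL are
placeholders)::

    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    class MySpider(scrapy.Spider):
        name = 'myspider'
        start_urls = ['https://scrapinghub.com']

        def parse(self, response):
            self.logger.info('Parsed %s', response.url)

    configure_logging({'LOG_LEVEL': 'INFO'})  # install handlers before the crawl starts
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()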
|
||||
|
||||
Another option when running custom scripts is to manually configure the logging.
|
||||
To do this you can use :func:`logging.basicConfig` to set a basic root handler.
|
||||
|
||||
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
|
||||
so it is recommended to only use :func:`logging.basicConfig` together with
|
||||
:class:`~scrapy.crawler.CrawlerRunner`.
|
||||
|
||||
This is an example on how to redirect ``INFO`` or higher messages to a file::
|
||||
|
||||
import logging
|
||||
|
||||
logging.basicConfig(
|
||||
filename='log.txt',
|
||||
format='%(levelname)s: %(message)s',
|
||||
level=logging.INFO
|
||||
)
|
||||
|
||||
Refer to :ref:`run-from-script` for more details about using Scrapy this
|
||||
way.
|
||||
|
639
docs/topics/media-pipeline.rst
Normal file
@ -0,0 +1,639 @@
|
||||
.. _topics-media-pipeline:
|
||||
|
||||
===========================================
|
||||
Downloading and processing files and images
|
||||
===========================================
|
||||
|
||||
.. currentmodule:: scrapy.pipelines.images
|
||||
|
||||
Scrapy provides reusable :doc:`item pipelines </topics/item-pipeline>` for
|
||||
downloading files attached to a particular item (for example, when you scrape
|
||||
products and also want to download their images locally). These pipelines share
|
||||
a bit of functionality and structure (we refer to them as media pipelines), but
|
||||
typically you'll either use the Files Pipeline or the Images Pipeline.
|
||||
|
||||
Both pipelines implement these features:
|
||||
|
||||
* Avoid re-downloading media that was downloaded recently
|
||||
* Specifying where to store the media (filesystem directory, Amazon S3 bucket,
|
||||
Google Cloud Storage bucket)
|
||||
|
||||
The Images Pipeline has a few extra functions for processing images:
|
||||
|
||||
* Convert all downloaded images to a common format (JPG) and mode (RGB)
|
||||
* Thumbnail generation
|
||||
* Check images width/height to make sure they meet a minimum constraint
|
||||
|
||||
The pipelines also keep an internal queue of those media URLs which are currently
|
||||
being scheduled for download, and connect those responses that arrive containing
|
||||
the same media to that queue. This avoids downloading the same media more than
|
||||
once when it's shared by several items.
|
||||
|
||||
Using the Files Pipeline
|
||||
========================
|
||||
|
||||
The typical workflow, when using the :class:`FilesPipeline`, goes like
|
||||
this:
|
||||
|
||||
1. In a Spider, you scrape an item and put the URLs of the desired files into a
|
||||
``file_urls`` field.
|
||||
|
||||
2. The item is returned from the spider and goes to the item pipeline.
|
||||
|
||||
3. When the item reaches the :class:`FilesPipeline`, the URLs in the
|
||||
``file_urls`` field are scheduled for download using the standard
|
||||
Scrapy scheduler and downloader (which means the scheduler and downloader
|
||||
middlewares are reused), but with a higher priority, processing them before other
|
||||
pages are scraped. The item remains "locked" at that particular pipeline stage
|
||||
until the files have finished downloading (or fail for some reason).
|
||||
|
||||
4. When the files are downloaded, another field (``files``) will be populated
|
||||
with the results. This field will contain a list of dicts with information
|
||||
about the downloaded files, such as the downloaded path, the original
|
||||
scraped url (taken from the ``file_urls`` field), the file checksum and the file status.
|
||||
The files in the list of the ``files`` field will retain the same order of
|
||||
the original ``file_urls`` field. If some file failed downloading, an
|
||||
error will be logged and the file won't be present in the ``files`` field.
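As a rough sketch of step 1 (the spider name, start URL and CSS selector are
placeholders), the spider only has to collect absolute URLs into ``file_urls``;
the pipeline takes care of the rest::

    import scrapy

    class PdfSpider(scrapy.Spider):
        name = 'pdfs'
        start_urls = ['https://example.com/reports']

        def parse(self, response):
            yield {
                # the FilesPipeline schedules these URLs for download and
                # stores the results in a 'files' field on the same item
                'file_urls': [
                    response.urljoin(href)
                    for href in response.css('a[href$=".pdf"]::attr(href)').getall()
                ],
            }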
|
||||
|
||||
|
||||
Using the Images Pipeline
|
||||
=========================
|
||||
|
||||
Using the :class:`ImagesPipeline` is a lot like using the :class:`FilesPipeline`,
|
||||
except the default field names used are different: you use ``image_urls`` for
|
||||
the image URLs of an item and it will populate an ``images`` field for the information
|
||||
about the downloaded images.
|
||||
|
||||
The advantage of using the :class:`ImagesPipeline` for image files is that you
|
||||
can configure some extra functions like generating thumbnails and filtering
|
||||
the images based on their size.
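If you declare your items as :class:`~scrapy.item.Item` subclasses rather than
plain dicts, the two fields just need to be present; a minimal sketch (the
class name is arbitrary)::

    import scrapy

    class ProductImagesItem(scrapy.Item):
        # fields read and written by the ImagesPipeline (default names)
        image_urls = scrapy.Field()
        images = scrapy.Field()

The default field names can be changed with the ``IMAGES_URLS_FIELD`` and
``IMAGES_RESULT_FIELD`` settings.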
|
||||
|
||||
The Images Pipeline uses `Pillow`_ for thumbnailing and normalizing images to
|
||||
JPEG/RGB format, so you need to install this library in order to use it.
|
||||
`Python Imaging Library`_ (PIL) should also work in most cases, but it is known
|
||||
to cause trouble in some setups, so we recommend using `Pillow`_ instead of
|
||||
PIL.
|
||||
|
||||
.. _Pillow: https://github.com/python-pillow/Pillow
|
||||
.. _Python Imaging Library: http://www.pythonware.com/products/pil/
|
||||
|
||||
|
||||
.. _topics-media-pipeline-enabling:
|
||||
|
||||
Enabling your Media Pipeline
|
||||
============================
|
||||
|
||||
.. setting:: IMAGES_STORE
|
||||
.. setting:: FILES_STORE
|
||||
|
||||
To enable your media pipeline you must first add it to your project
|
||||
:setting:`ITEM_PIPELINES` setting.
|
||||
|
||||
For Images Pipeline, use::
|
||||
|
||||
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
|
||||
|
||||
For Files Pipeline, use::
|
||||
|
||||
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
|
||||
|
||||
.. note::
|
||||
You can also use both the Files and Images Pipeline at the same time.
|
||||
|
||||
|
||||
Then, configure the target storage setting to a valid value that will be used
|
||||
for storing the downloaded images. Otherwise the pipeline will remain disabled,
|
||||
even if you include it in the :setting:`ITEM_PIPELINES` setting.
|
||||
|
||||
For the Files Pipeline, set the :setting:`FILES_STORE` setting::
|
||||
|
||||
FILES_STORE = '/path/to/valid/dir'
|
||||
|
||||
For the Images Pipeline, set the :setting:`IMAGES_STORE` setting::
|
||||
|
||||
IMAGES_STORE = '/path/to/valid/dir'
|
||||
|
||||
Supported Storage
|
||||
=================
|
||||
|
||||
File system storage
|
||||
-------------------
|
||||
|
||||
The files are stored using a `SHA1 hash`_ of their URLs for the file names.
|
||||
|
||||
For example, the following image URL::
|
||||
|
||||
http://www.example.com/image.jpg
|
||||
|
||||
Whose ``SHA1 hash`` is::
|
||||
|
||||
3afec3b4765f8f0a07b78f98c07b83f013567a0a
|
||||
|
||||
Will be downloaded and stored in the following file::
|
||||
|
||||
<IMAGES_STORE>/full/3afec3b4765f8f0a07b78f98c07b83f013567a0a.jpg
|
||||
|
||||
Where:
|
||||
|
||||
* ``<IMAGES_STORE>`` is the directory defined in :setting:`IMAGES_STORE` setting
|
||||
for the Images Pipeline.
|
||||
|
||||
* ``full`` is a sub-directory to separate full images from thumbnails (if
|
||||
used). For more info see :ref:`topics-images-thumbnails`.
|
||||
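As a rough sketch of how such a path is derived (this mirrors the description
above rather than quoting Scrapy's internal code), the file name is the SHA1
hex digest of the URL plus the original extension::

    import hashlib

    def storage_path(url, store='/path/to/valid/dir'):
        # SHA1 hex digest of the URL; the '.jpg' extension is taken
        # from the URL in this example
        media_guid = hashlib.sha1(url.encode('utf-8')).hexdigest()
        return '{}/full/{}.jpg'.format(store, media_guid)

    print(storage_path('http://www.example.com/image.jpg'))
    # /path/to/valid/dir/full/3afec3b4765f8f0a07b78f98c07b83f013567a0a.jpg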
|
||||
.. _media-pipeline-ftp:
|
||||
|
||||
FTP server storage
|
||||
------------------
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can point to an FTP server.
|
||||
Scrapy will automatically upload the files to the server.
|
||||
|
||||
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` should be written in one of the
|
||||
following forms::
|
||||
|
||||
ftp://username:password@address:port/path
|
||||
ftp://address:port/path
|
||||
|
||||
If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and
|
||||
:setting:`FTP_PASSWORD` settings respectively.
|
||||
|
||||
FTP supports two different connection modes: active or passive. Scrapy uses
|
||||
the passive connection mode by default. To use the active connection mode instead,
|
||||
set the :setting:`FEED_STORAGE_FTP_ACTIVE` setting to ``True``.
|
||||
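For example, a hypothetical FTP configuration could look like this (host,
credentials and path are placeholders)::

    FILES_STORE = 'ftp://ftp.example.com:21/scrapy/files'
    FTP_USER = 'scrapy'              # used when the URL carries no credentials
    FTP_PASSWORD = 'secret'          # likewise
    FEED_STORAGE_FTP_ACTIVE = False  # keep the default passive connection mode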
|
||||
Amazon S3 storage
|
||||
-----------------
|
||||
|
||||
.. setting:: FILES_STORE_S3_ACL
|
||||
.. setting:: IMAGES_STORE_S3_ACL
|
||||
|
||||
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can represent an Amazon S3
|
||||
bucket. Scrapy will automatically upload the files to the bucket.
|
||||
|
||||
For example, this is a valid :setting:`IMAGES_STORE` value::
|
||||
|
||||
IMAGES_STORE = 's3://bucket/images'
|
||||
|
||||
You can modify the Access Control List (ACL) policy used for the stored files,
|
||||
which is defined by the :setting:`FILES_STORE_S3_ACL` and
|
||||
:setting:`IMAGES_STORE_S3_ACL` settings. By default, the ACL is set to
|
||||
``private``. To make the files publicly available use the ``public-read``
|
||||
policy::
|
||||
|
||||
IMAGES_STORE_S3_ACL = 'public-read'
|
||||
|
||||
For more information, see `canned ACLs`_ in the Amazon S3 Developer Guide.
|
||||
|
||||
Because Scrapy uses ``botocore`` internally, you can also use other S3-like storages,
such as self-hosted `Minio`_ or `s3.scality`_. All you need to do is set the endpoint option in your Scrapy settings::
|
||||
|
||||
AWS_ENDPOINT_URL = 'http://minio.example.com:9000'
|
||||
|
||||
For self-hosted storage, you might also want to disable SSL and skip SSL certificate verification::
|
||||
|
||||
AWS_USE_SSL = False # or True (None by default)
|
||||
AWS_VERIFY = False # or True (None by default)
|
||||
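Putting these together, a hypothetical configuration for a self-hosted,
S3-compatible store might look like this (bucket name, endpoint and
credentials are placeholders)::

    FILES_STORE = 's3://scrapy-files/'
    AWS_ACCESS_KEY_ID = 'minio-access-key'
    AWS_SECRET_ACCESS_KEY = 'minio-secret-key'
    AWS_ENDPOINT_URL = 'http://minio.example.com:9000'
    AWS_USE_SSL = False
    AWS_VERIFY = False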
|
||||
.. _Minio: https://github.com/minio/minio
|
||||
.. _s3.scality: https://s3.scality.com/
|
||||
.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
|
||||
|
||||
|
||||
.. _media-pipeline-gcs:
|
||||
|
||||
Google Cloud Storage
|
||||
---------------------
|
||||
|
||||
.. setting:: FILES_STORE_GCS_ACL
|
||||
.. setting:: IMAGES_STORE_GCS_ACL
|
||||
|
||||
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can represent a Google Cloud Storage
|
||||
bucket. Scrapy will automatically upload the files to the bucket (requires `google-cloud-storage`_).
|
||||
|
||||
.. _google-cloud-storage: https://cloud.google.com/storage/docs/reference/libraries#client-libraries-install-python
|
||||
|
||||
For example, these are valid :setting:`IMAGES_STORE` and :setting:`GCS_PROJECT_ID` settings::
|
||||
|
||||
IMAGES_STORE = 'gs://bucket/images/'
|
||||
GCS_PROJECT_ID = 'project_id'
|
||||
|
||||
For information about authentication, see this `documentation`_.
|
||||
|
||||
.. _documentation: https://cloud.google.com/docs/authentication/production
|
||||
|
||||
You can modify the Access Control List (ACL) policy used for the stored files,
|
||||
which is defined by the :setting:`FILES_STORE_GCS_ACL` and
|
||||
:setting:`IMAGES_STORE_GCS_ACL` settings. By default, the ACL is set to
|
||||
``''`` (empty string) which means that Cloud Storage applies the bucket's default object ACL to the object.
|
||||
To make the files publicly available use the ``publicRead``
|
||||
policy::
|
||||
|
||||
IMAGES_STORE_GCS_ACL = 'publicRead'
|
||||
|
||||
For more information, see `Predefined ACLs`_ in the Google Cloud Platform Developer Guide.
|
||||
|
||||
.. _Predefined ACLs: https://cloud.google.com/storage/docs/access-control/lists#predefined-acl
|
||||
|
||||
Usage example
|
||||
=============
|
||||
|
||||
.. setting:: FILES_URLS_FIELD
|
||||
.. setting:: FILES_RESULT_FIELD
|
||||
.. setting:: IMAGES_URLS_FIELD
|
||||
.. setting:: IMAGES_RESULT_FIELD
|
||||
|
||||
In order to use a media pipeline, first :ref:`enable it
|
||||
<topics-media-pipeline-enabling>`.
|
||||
|
||||
Then, if a spider returns an :ref:`item object <topics-items>` with the URLs
|
||||
field (``file_urls`` or ``image_urls``, for the Files or Images Pipeline
|
||||
respectively), the pipeline will put the results under the respective field
|
||||
(``files`` or ``images``).
|
||||
|
||||
When using :ref:`item types <item-types>` for which fields are defined beforehand,
|
||||
you must define both the URLs field and the results field. For example, when
|
||||
using the images pipeline, items must define both the ``image_urls`` and the
|
||||
``images`` field. For instance, using the :class:`~scrapy.item.Item` class::
|
||||
|
||||
import scrapy
|
||||
|
||||
class MyItem(scrapy.Item):
|
||||
# ... other item fields ...
|
||||
image_urls = scrapy.Field()
|
||||
images = scrapy.Field()
|
||||
|
||||
If you want to use another field name for the URLs key or for the results key,
|
||||
it is also possible to override it.
|
||||
|
||||
For the Files Pipeline, set :setting:`FILES_URLS_FIELD` and/or
|
||||
:setting:`FILES_RESULT_FIELD` settings::
|
||||
|
||||
FILES_URLS_FIELD = 'field_name_for_your_files_urls'
|
||||
FILES_RESULT_FIELD = 'field_name_for_your_processed_files'
|
||||
|
||||
For the Images Pipeline, set :setting:`IMAGES_URLS_FIELD` and/or
|
||||
:setting:`IMAGES_RESULT_FIELD` settings::
|
||||
|
||||
IMAGES_URLS_FIELD = 'field_name_for_your_images_urls'
|
||||
IMAGES_RESULT_FIELD = 'field_name_for_your_processed_images'
|
||||
|
||||
If you need something more complex and want to override the custom pipeline
|
||||
behaviour, see :ref:`topics-media-pipeline-override`.
|
||||
|
||||
If you have multiple image pipelines inheriting from ImagesPipeline and you want
to have different settings in different pipelines, you can set setting keys
preceded by the uppercase name of your pipeline class. E.g. if your pipeline is
called MyPipeline and you want a custom IMAGES_URLS_FIELD, you define the
setting MYPIPELINE_IMAGES_URLS_FIELD and your custom settings will be used.
|
||||
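For example, assuming a project pipeline at a hypothetical import path and a
hypothetical field name::

    ITEM_PIPELINES = {
        'myproject.pipelines.MyPipeline': 300,
    }

    # Only MyPipeline reads image URLs from this field; other
    # ImagesPipeline subclasses keep using IMAGES_URLS_FIELD.
    MYPIPELINE_IMAGES_URLS_FIELD = 'product_image_urls'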
|
||||
|
||||
Additional features
|
||||
===================
|
||||
|
||||
File expiration
|
||||
---------------
|
||||
|
||||
.. setting:: IMAGES_EXPIRES
|
||||
.. setting:: FILES_EXPIRES
|
||||
|
||||
The Files Pipeline and the Images Pipeline avoid downloading files that were
downloaded recently. To adjust this retention delay use the
:setting:`FILES_EXPIRES` setting (or :setting:`IMAGES_EXPIRES`, in the case of
the Images Pipeline), which specifies the delay in number of days::
|
||||
|
||||
# 120 days of delay for files expiration
|
||||
FILES_EXPIRES = 120
|
||||
|
||||
# 30 days of delay for images expiration
|
||||
IMAGES_EXPIRES = 30
|
||||
|
||||
The default value for both settings is 90 days.
|
||||
|
||||
If you have a pipeline that subclasses FilesPipeline and you'd like to have a
different setting for it, you can set setting keys preceded by the uppercase
class name. E.g. given a pipeline class called MyPipeline, you can set the
setting key::

    MYPIPELINE_FILES_EXPIRES = 180

and the pipeline class MyPipeline will have its expiration time set to 180 days.
|
||||
|
||||
.. _topics-images-thumbnails:
|
||||
|
||||
Thumbnail generation for images
|
||||
-------------------------------
|
||||
|
||||
The Images Pipeline can automatically create thumbnails of the downloaded
|
||||
images.
|
||||
|
||||
.. setting:: IMAGES_THUMBS
|
||||
|
||||
In order to use this feature, you must set :setting:`IMAGES_THUMBS` to a dictionary
|
||||
where the keys are the thumbnail names and the values are their dimensions.
|
||||
|
||||
For example::
|
||||
|
||||
IMAGES_THUMBS = {
|
||||
'small': (50, 50),
|
||||
'big': (270, 270),
|
||||
}
|
||||
|
||||
When you use this feature, the Images Pipeline will create thumbnails of
each specified size with this format::
|
||||
|
||||
<IMAGES_STORE>/thumbs/<size_name>/<image_id>.jpg
|
||||
|
||||
Where:
|
||||
|
||||
* ``<size_name>`` is the one specified in the :setting:`IMAGES_THUMBS`
|
||||
dictionary keys (``small``, ``big``, etc)
|
||||
|
||||
* ``<image_id>`` is the `SHA1 hash`_ of the image url
|
||||
|
||||
.. _SHA1 hash: https://en.wikipedia.org/wiki/SHA_hash_functions
|
||||
|
||||
Example of image files stored using ``small`` and ``big`` thumbnail names::
|
||||
|
||||
<IMAGES_STORE>/full/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
<IMAGES_STORE>/thumbs/small/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
<IMAGES_STORE>/thumbs/big/63bbfea82b8880ed33cdb762aa11fab722a90a24.jpg
|
||||
|
||||
The first one is the full image, as downloaded from the site.
|
||||
|
||||
Filtering out small images
|
||||
--------------------------
|
||||
|
||||
.. setting:: IMAGES_MIN_HEIGHT
|
||||
|
||||
.. setting:: IMAGES_MIN_WIDTH
|
||||
|
||||
When using the Images Pipeline, you can drop images which are too small, by
|
||||
specifying the minimum allowed size in the :setting:`IMAGES_MIN_HEIGHT` and
|
||||
:setting:`IMAGES_MIN_WIDTH` settings.
|
||||
|
||||
For example::
|
||||
|
||||
IMAGES_MIN_HEIGHT = 110
|
||||
IMAGES_MIN_WIDTH = 110
|
||||
|
||||
.. note::
|
||||
The size constraints don't affect thumbnail generation at all.
|
||||
|
||||
It is possible to set just one size constraint or both. When setting both of
|
||||
them, only images that satisfy both minimum sizes will be saved. For the
|
||||
above example, images of sizes (105 x 105) or (105 x 200) or (200 x 105) will
|
||||
all be dropped because at least one dimension is shorter than the constraint.
|
||||
|
||||
By default, there are no size constraints, so all images are processed.
|
||||
|
||||
Allowing redirections
|
||||
---------------------
|
||||
|
||||
.. setting:: MEDIA_ALLOW_REDIRECTS
|
||||
|
||||
By default media pipelines ignore redirects, i.e. an HTTP redirection
in response to a media file URL request means the media download is considered failed.
|
||||
|
||||
To handle media redirections, set this setting to ``True``::
|
||||
|
||||
MEDIA_ALLOW_REDIRECTS = True
|
||||
|
||||
.. _topics-media-pipeline-override:
|
||||
|
||||
Extending the Media Pipelines
|
||||
=============================
|
||||
|
||||
.. module:: scrapy.pipelines.files
|
||||
:synopsis: Files Pipeline
|
||||
|
||||
Here are the methods that you can override in your custom Files Pipeline:
|
||||
|
||||
.. class:: FilesPipeline
|
||||
|
||||
.. method:: file_path(self, request, response=None, info=None, *, item=None)
|
||||
|
||||
This method is called once per downloaded item. It returns the
|
||||
download path of the file originating from the specified
|
||||
:class:`response <scrapy.http.Response>`.
|
||||
|
||||
In addition to ``response``, this method receives the original
|
||||
:class:`request <scrapy.Request>`,
|
||||
:class:`info <scrapy.pipelines.media.MediaPipeline.SpiderInfo>` and
|
||||
:class:`item <scrapy.item.Item>`.
|
||||
|
||||
You can override this method to customize the download path of each file.
|
||||
|
||||
For example, if file URLs end like regular paths (e.g.
|
||||
``https://example.com/a/b/c/foo.png``), you can use the following
|
||||
approach to download all files into the ``files`` folder with their
|
||||
original filenames (e.g. ``files/foo.png``)::
|
||||
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from scrapy.pipelines.files import FilesPipeline
|
||||
|
||||
class MyFilesPipeline(FilesPipeline):
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
return 'files/' + os.path.basename(urlparse(request.url).path)
|
||||
|
||||
Similarly, you can use the ``item`` to determine the file path based on some item
|
||||
property.
|
||||
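For instance, here is a sketch that groups downloaded files under a
hypothetical ``category`` item field (the field name is an assumption made for
illustration)::

    import os
    from urllib.parse import urlparse

    from itemadapter import ItemAdapter
    from scrapy.pipelines.files import FilesPipeline

    class PerCategoryFilesPipeline(FilesPipeline):

        def file_path(self, request, response=None, info=None, *, item=None):
            # 'category' is a hypothetical item field
            category = ItemAdapter(item).get('category') or 'uncategorized'
            filename = os.path.basename(urlparse(request.url).path)
            return '{}/{}'.format(category, filename)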
|
||||
By default the :meth:`file_path` method returns
|
||||
``full/<request URL hash>.<extension>``.
|
||||
|
||||
.. method:: FilesPipeline.get_media_requests(item, info)
|
||||
|
||||
As seen in the workflow, the pipeline will get the URLs of the files to
download from the item. In order to do this, you can override the
|
||||
:meth:`~get_media_requests` method and return a Request for each
|
||||
file URL::
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
adapter = ItemAdapter(item)
|
||||
for file_url in adapter['file_urls']:
|
||||
yield scrapy.Request(file_url)
|
||||
|
||||
Those requests will be processed by the pipeline and, when they have finished
|
||||
downloading, the results will be sent to the
|
||||
:meth:`~item_completed` method, as a list of 2-element tuples.
|
||||
Each tuple will contain ``(success, file_info_or_error)`` where:
|
||||
|
||||
* ``success`` is a boolean which is ``True`` if the file was downloaded
  successfully or ``False`` if it failed for some reason
|
||||
|
||||
* ``file_info_or_error`` is a dict containing the following keys (if
|
||||
success is ``True``) or a :exc:`~twisted.python.failure.Failure` if
|
||||
there was a problem.
|
||||
|
||||
* ``url`` - the url where the file was downloaded from. This is the url of
|
||||
the request returned from the :meth:`~get_media_requests`
|
||||
method.
|
||||
|
||||
* ``path`` - the path (relative to :setting:`FILES_STORE`) where the file
|
||||
was stored
|
||||
|
||||
* ``checksum`` - an `MD5 hash`_ of the file contents
|
||||
|
||||
* ``status`` - the file status indication.
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
It can be one of the following:
|
||||
|
||||
* ``downloaded`` - file was downloaded.
|
||||
* ``uptodate`` - file was not downloaded, as it was downloaded recently,
|
||||
according to the file expiration policy.
|
||||
* ``cached`` - file was already scheduled for download, by another item
|
||||
sharing the same file.
|
||||
|
||||
The list of tuples received by :meth:`~item_completed` is
|
||||
guaranteed to retain the same order of the requests returned from the
|
||||
:meth:`~get_media_requests` method.
|
||||
|
||||
Here's a typical value of the ``results`` argument::
|
||||
|
||||
[(True,
|
||||
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
|
||||
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
|
||||
'url': 'http://www.example.com/files/product1.pdf',
|
||||
'status': 'downloaded'}),
|
||||
(False,
|
||||
Failure(...))]
|
||||
|
||||
By default the :meth:`get_media_requests` method returns ``None`` which
|
||||
means there are no files to download for the item.
|
||||
|
||||
.. method:: FilesPipeline.item_completed(results, item, info)
|
||||
|
||||
The :meth:`FilesPipeline.item_completed` method is called when all file
|
||||
requests for a single item have completed (either finished downloading, or
|
||||
failed for some reason).
|
||||
|
||||
The :meth:`~item_completed` method must return the
|
||||
output that will be sent to subsequent item pipeline stages, so you must
|
||||
return (or drop) the item, as you would in any pipeline.
|
||||
|
||||
Here is an example of the :meth:`~item_completed` method where we
|
||||
store the downloaded file paths (passed in results) in the ``file_paths``
|
||||
item field, and we drop the item if it doesn't contain any files::
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
file_paths = [x['path'] for ok, x in results if ok]
|
||||
if not file_paths:
|
||||
raise DropItem("Item contains no files")
|
||||
adapter = ItemAdapter(item)
|
||||
adapter['file_paths'] = file_paths
|
||||
return item
|
||||
|
||||
By default, the :meth:`item_completed` method returns the item.
|
||||
|
||||
|
||||
.. module:: scrapy.pipelines.images
|
||||
:synopsis: Images Pipeline
|
||||
|
||||
Here are the methods that you can override in your custom Images Pipeline:
|
||||
|
||||
.. class:: ImagesPipeline
|
||||
|
||||
The :class:`ImagesPipeline` is an extension of the :class:`FilesPipeline`,
|
||||
customizing the field names and adding custom behavior for images.
|
||||
|
||||
.. method:: file_path(self, request, response=None, info=None, *, item=None)
|
||||
|
||||
This method is called once per downloaded item. It returns the
|
||||
download path of the file originating from the specified
|
||||
:class:`response <scrapy.http.Response>`.
|
||||
|
||||
In addition to ``response``, this method receives the original
|
||||
:class:`request <scrapy.Request>`,
|
||||
:class:`info <scrapy.pipelines.media.MediaPipeline.SpiderInfo>` and
|
||||
:class:`item <scrapy.item.Item>`.
|
||||
|
||||
You can override this method to customize the download path of each file.
|
||||
|
||||
For example, if file URLs end like regular paths (e.g.
|
||||
``https://example.com/a/b/c/foo.png``), you can use the following
|
||||
approach to download all files into the ``files`` folder with their
|
||||
original filenames (e.g. ``files/foo.png``)::
|
||||
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
return 'files/' + os.path.basename(urlparse(request.url).path)
|
||||
|
||||
Similarly, you can use the ``item`` to determine the file path based on some item
|
||||
property.
|
||||
|
||||
By default the :meth:`file_path` method returns
|
||||
``full/<request URL hash>.<extension>``.
|
||||
|
||||
.. method:: ImagesPipeline.get_media_requests(item, info)
|
||||
|
||||
Works the same way as :meth:`FilesPipeline.get_media_requests` method,
|
||||
but using a different field name for image URLs.
|
||||
|
||||
Must return a Request for each image URL.
|
||||
|
||||
.. method:: ImagesPipeline.item_completed(results, item, info)
|
||||
|
||||
The :meth:`ImagesPipeline.item_completed` method is called when all image
|
||||
requests for a single item have completed (either finished downloading, or
|
||||
failed for some reason).
|
||||
|
||||
Works the same way as :meth:`FilesPipeline.item_completed` method,
|
||||
but using different field names for storing image downloading results.
|
||||
|
||||
By default, the :meth:`item_completed` method returns the item.
|
||||
|
||||
|
||||
.. _media-pipeline-example:
|
||||
|
||||
Custom Images pipeline example
|
||||
==============================
|
||||
|
||||
Here is a full example of the Images Pipeline whose methods are exemplified
|
||||
above::
|
||||
|
||||
import scrapy
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
for image_url in item['image_urls']:
|
||||
yield scrapy.Request(image_url)
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
image_paths = [x['path'] for ok, x in results if ok]
|
||||
if not image_paths:
|
||||
raise DropItem("Item contains no images")
|
||||
adapter = ItemAdapter(item)
|
||||
adapter['image_paths'] = image_paths
|
||||
return item
|
||||
|
||||
|
||||
To enable your custom media pipeline component you must add its class import path to the
|
||||
:setting:`ITEM_PIPELINES` setting, like in the following example::
|
||||
|
||||
ITEM_PIPELINES = {
|
||||
'myproject.pipelines.MyImagesPipeline': 300
|
||||
}
|
||||
|
||||
.. _MD5 hash: https://en.wikipedia.org/wiki/MD5
|
@ -18,32 +18,93 @@ the typical way of running Scrapy via ``scrapy crawl``.
|
||||
Remember that Scrapy is built on top of the Twisted
|
||||
asynchronous networking library, so you need to run it inside the Twisted reactor.
|
||||
|
||||
Note that you will also have to shutdown the Twisted reactor yourself after the
|
||||
spider is finished. This can be achieved by connecting a handler to the
|
||||
``signals.spider_closed`` signal.
|
||||
The first utility you can use to run your spiders is
|
||||
:class:`scrapy.crawler.CrawlerProcess`. This class will start a Twisted reactor
|
||||
for you, configuring the logging and setting shutdown handlers. This class is
|
||||
the one used by all Scrapy commands.
|
||||
|
||||
Here's an example showing how to run a single spider with it.
|
||||
|
||||
::
|
||||
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
|
||||
class MySpider(scrapy.Spider):
|
||||
# Your spider definition
|
||||
...
|
||||
|
||||
process = CrawlerProcess(settings={
|
||||
"FEEDS": {
|
||||
"items.json": {"format": "json"},
|
||||
},
|
||||
})
|
||||
|
||||
process.crawl(MySpider)
|
||||
process.start() # the script will block here until the crawling is finished
|
||||
|
||||
Define settings within a dictionary in CrawlerProcess. Make sure to check the :class:`~scrapy.crawler.CrawlerProcess`
documentation to get acquainted with its usage details.
|
||||
|
||||
If you are inside a Scrapy project there are some additional helpers you can
|
||||
use to import those components within the project. You can automatically import
|
||||
your spiders by passing their name to :class:`~scrapy.crawler.CrawlerProcess`, and
|
||||
use ``get_project_settings`` to get a :class:`~scrapy.settings.Settings`
|
||||
instance with your project settings.
|
||||
|
||||
What follows is a working example of how to do that, using the `testspiders`_
|
||||
project as example.
|
||||
|
||||
::
|
||||
|
||||
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())

    # 'followall' is the name of one of the spiders of the project.
    process.crawl('followall', domain='scrapinghub.com')
    process.start() # the script will block here until the crawling is finished
|
||||
|
||||
There's another Scrapy utility that provides more control over the crawling
|
||||
process: :class:`scrapy.crawler.CrawlerRunner`. This class is a thin wrapper
|
||||
that encapsulates some simple helpers to run multiple crawlers, but it won't
|
||||
start or interfere with existing reactors in any way.
|
||||
|
||||
When using this class, the reactor should be explicitly run after scheduling your
|
||||
spiders. It's recommended you use :class:`~scrapy.crawler.CrawlerRunner`
|
||||
instead of :class:`~scrapy.crawler.CrawlerProcess` if your application is
|
||||
already using Twisted and you want to run Scrapy in the same reactor.
|
||||
|
||||
Note that you will also have to shutdown the Twisted reactor yourself after the
|
||||
spider is finished. This can be achieved by adding callbacks to the deferred
|
||||
returned by the :meth:`CrawlerRunner.crawl
|
||||
<scrapy.crawler.CrawlerRunner.crawl>` method.
|
||||
|
||||
Here's an example of its usage, along with a callback to manually stop the
|
||||
reactor after ``MySpider`` has finished running.
|
||||
|
||||
::
|
||||
|
||||
from twisted.internet import reactor
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerRunner
|
||||
from scrapy.utils.log import configure_logging
|
||||
|
||||
class MySpider(scrapy.Spider):
|
||||
# Your spider definition
|
||||
...
|
||||
|
||||
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
|
||||
runner = CrawlerRunner()
|
||||
|
||||
d = runner.crawl(MySpider)
|
||||
d.addBoth(lambda _: reactor.stop())
|
||||
reactor.run() # the script will block here until the crawling is finished
|
||||
|
||||
.. seealso:: :doc:`twisted:core/howto/reactor-basics`
|
||||
|
||||
.. _run-multiple-spiders:
|
||||
|
||||
Running multiple spiders in the same process
|
||||
============================================
|
||||
@ -52,28 +113,79 @@ By default, Scrapy runs a single spider per process when you run ``scrapy
|
||||
crawl``. However, Scrapy supports running multiple spiders per process using
|
||||
the :ref:`internal API <topics-api>`.
|
||||
|
||||
Here is an example, using the `testspiders`_ project:
|
||||
Here is an example that runs multiple spiders simultaneously:
|
||||
|
||||
::
|
||||
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
|
||||
class MySpider1(scrapy.Spider):
|
||||
# Your first spider definition
|
||||
...
|
||||
|
||||
class MySpider2(scrapy.Spider):
|
||||
# Your second spider definition
|
||||
...
|
||||
|
||||
process = CrawlerProcess()
|
||||
process.crawl(MySpider1)
|
||||
process.crawl(MySpider2)
|
||||
process.start() # the script will block here until all crawling jobs are finished
|
||||
|
||||
Same example using :class:`~scrapy.crawler.CrawlerRunner`:
|
||||
|
||||
::
|
||||
|
||||
    import scrapy
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    class MySpider1(scrapy.Spider):
        # Your first spider definition
        ...

    class MySpider2(scrapy.Spider):
        # Your second spider definition
        ...

    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(MySpider1)
    runner.crawl(MySpider2)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run() # the script will block here until all crawling jobs are finished
|
||||
|
||||
Same example but running the spiders sequentially by chaining the deferreds:
|
||||
|
||||
::
|
||||
|
||||
    import scrapy
    from twisted.internet import reactor, defer
|
||||
from scrapy.crawler import CrawlerRunner
|
||||
from scrapy.utils.log import configure_logging
|
||||
|
||||
class MySpider1(scrapy.Spider):
|
||||
# Your first spider definition
|
||||
...
|
||||
|
||||
class MySpider2(scrapy.Spider):
|
||||
# Your second spider definition
|
||||
...
|
||||
|
||||
configure_logging()
|
||||
runner = CrawlerRunner()
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def crawl():
|
||||
yield runner.crawl(MySpider1)
|
||||
yield runner.crawl(MySpider2)
|
||||
reactor.stop()
|
||||
|
||||
crawl()
|
||||
reactor.run() # the script will block here until the last crawl call is finished
|
||||
|
||||
.. seealso:: :ref:`run-from-script`.
|
||||
|
||||
@ -118,7 +230,7 @@ with varying degrees of sophistication. Getting around those measures can be
|
||||
difficult and tricky, and may sometimes require special infrastructure. Please
|
||||
consider contacting `commercial support`_ if in doubt.
|
||||
|
||||
Here are some tips to keep in mind when dealing with these kind of sites:
|
||||
Here are some tips to keep in mind when dealing with these kinds of sites:
|
||||
|
||||
* rotate your user agent from a pool of well-known ones from browsers (google
|
||||
around to get a list of them)
|
||||
@ -128,7 +240,8 @@ Here are some tips to keep in mind when dealing with these kind of sites:
|
||||
* if possible, use `Google cache`_ to fetch pages, instead of hitting the sites
|
||||
directly
|
||||
* use a pool of rotating IPs. For example, the free `Tor project`_ or paid
|
||||
services like `ProxyMesh`_
|
||||
services like `ProxyMesh`_. An open source alternative is `scrapoxy`_, a
|
||||
super proxy that you can attach your own proxies to.
|
||||
* use a highly distributed downloader that circumvents bans internally, so you
|
||||
can just focus on parsing clean pages. One example of such downloaders is
|
||||
`Crawlera`_
|
||||
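As a minimal sketch, some of these tips map directly onto settings (the values
below are arbitrary examples; rotating user agents and proxies typically
requires a custom or third-party downloader middleware)::

    # settings.py
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
    COOKIES_ENABLED = False      # some sites use cookies to spot bot behaviour
    DOWNLOAD_DELAY = 2           # slow down, and avoid hitting the site too hard
    AUTOTHROTTLE_ENABLED = True  # adapt the delay to the server's response times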
@ -137,30 +250,9 @@ If you are still unable to prevent your bot getting banned, consider contacting
|
||||
`commercial support`_.
|
||||
|
||||
.. _Tor project: https://www.torproject.org/
|
||||
.. _commercial support: http://scrapy.org/support/
|
||||
.. _ProxyMesh: http://proxymesh.com/
|
||||
.. _commercial support: https://scrapy.org/support/
|
||||
.. _ProxyMesh: https://proxymesh.com/
|
||||
.. _Google cache: http://www.googleguide.com/cached_pages.html
|
||||
.. _testspiders: https://github.com/scrapinghub/testspiders
|
||||
.. _Twisted Reactor Overview: http://twistedmatrix.com/documents/current/core/howto/reactor-basics.html
|
||||
.. _Crawlera: http://crawlera.com
|
||||
|
||||
.. _dynamic-item-classes:
|
||||
|
||||
Dynamic Creation of Item Classes
|
||||
================================
|
||||
|
||||
For applications in which the structure of item class is to be determined by
|
||||
user input, or other changing conditions, you can dynamically create item
|
||||
classes instead of manually coding them.
|
||||
|
||||
::
|
||||
|
||||
|
||||
from scrapy.item import DictItem, Field
|
||||
|
||||
def create_item_class(class_name, field_list):
|
||||
field_dict = {}
|
||||
for field_name in field_list:
|
||||
field_dict[field_name] = Field()
|
||||
|
||||
return type(class_name, (DictItem,), field_dict)
|
||||
.. _Crawlera: https://scrapinghub.com/crawlera
|
||||
.. _scrapoxy: https://scrapoxy.io/
|
||||
|
@ -24,37 +24,39 @@ below in :ref:`topics-request-response-ref-request-subclasses` and
|
||||
Request objects
|
||||
===============
|
||||
|
||||
.. class:: Request(url[, callback, method='GET', headers, body, cookies, meta, encoding='utf-8', priority=0, dont_filter=False, errback])
|
||||
.. autoclass:: Request
|
||||
|
||||
A :class:`Request` object represents an HTTP request, which is usually
|
||||
generated in the Spider and executed by the Downloader, thus generating
a :class:`Response`.
|
||||
|
||||
:param url: the URL of this request
|
||||
:type url: string
|
||||
|
||||
If the URL is invalid, a :exc:`ValueError` exception is raised.
|
||||
:type url: str
|
||||
|
||||
:param callback: the function that will be called with the response of this
|
||||
request (once its downloaded) as its first parameter. For more information
|
||||
request (once it's downloaded) as its first parameter. For more information
|
||||
see :ref:`topics-request-response-ref-request-callback-arguments` below.
|
||||
If a Request doesn't specify a callback, the spider's
|
||||
:meth:`~scrapy.spider.Spider.parse` method will be used.
|
||||
:meth:`~scrapy.spiders.Spider.parse` method will be used.
|
||||
Note that if exceptions are raised during processing, errback is called instead.
|
||||
|
||||
:type callback: callable
|
||||
:type callback: collections.abc.Callable
|
||||
|
||||
:param method: the HTTP method of this request. Defaults to ``'GET'``.
|
||||
:type method: string
|
||||
:type method: str
|
||||
|
||||
:param meta: the initial values for the :attr:`Request.meta` attribute. If
|
||||
given, the dict passed in this parameter will be shallow copied.
|
||||
:type meta: dict
|
||||
|
||||
:param body: the request body. If a ``unicode`` is passed, then it's encoded to
|
||||
``str`` using the `encoding` passed (which defaults to ``utf-8``). If
|
||||
``body`` is not given,, an empty string is stored. Regardless of the
|
||||
type of this argument, the final value stored will be a ``str`` (never
|
||||
``unicode`` or ``None``).
|
||||
:type body: str or unicode
|
||||
:param body: the request body. If a string is passed, then it's encoded as
|
||||
bytes using the ``encoding`` passed (which defaults to ``utf-8``). If
|
||||
``body`` is not given, an empty bytes object is stored. Regardless of the
|
||||
type of this argument, the final value stored will be a bytes object
|
||||
(never a string or ``None``).
|
||||
:type body: bytes or str
|
||||
|
||||
:param headers: the headers of this request. The dict values can be strings
|
||||
(for single valued headers) or lists (for multi-valued headers). If
|
||||
@ -67,6 +69,7 @@ Request objects
|
||||
|
||||
request_with_cookies = Request(url="http://www.example.com",
|
||||
cookies={'currency': 'USD', 'country': 'UY'})
|
||||
|
||||
2. Using a list of dicts::
|
||||
|
||||
request_with_cookies = Request(url="http://www.example.com",
|
||||
@ -79,26 +82,32 @@ Request objects
|
||||
attributes of the cookie. This is only useful if the cookies are saved
|
||||
for later requests.
|
||||
|
||||
.. reqmeta:: dont_merge_cookies
|
||||
|
||||
When some site returns cookies (in a response) those are stored in the
|
||||
cookies for that domain and will be sent again in future requests. That's
|
||||
the typical behaviour of any regular web browser. However, if, for some
|
||||
reason, you want to avoid merging with existing cookies you can instruct
|
||||
Scrapy to do so by setting the ``dont_merge_cookies`` key to True in the
|
||||
:attr:`Request.meta`.
|
||||
cookies for that domain and will be sent again in future requests.
|
||||
That's the typical behaviour of any regular web browser.
|
||||
|
||||
Example of request without merging cookies::
|
||||
To create a request that does not send stored cookies and does not
|
||||
store received cookies, set the ``dont_merge_cookies`` key to ``True``
|
||||
in :attr:`request.meta <scrapy.http.Request.meta>`.
|
||||
|
||||
request_with_cookies = Request(url="http://www.example.com",
|
||||
cookies={'currency': 'USD', 'country': 'UY'},
|
||||
meta={'dont_merge_cookies': True})
|
||||
Example of a request that sends manually-defined cookies and ignores
|
||||
cookie storage::
|
||||
|
||||
Request(
|
||||
url="http://www.example.com",
|
||||
cookies={'currency': 'USD', 'country': 'UY'},
|
||||
meta={'dont_merge_cookies': True},
|
||||
)
|
||||
|
||||
For more info see :ref:`cookies-mw`.
|
||||
:type cookies: dict or list
|
||||
|
||||
:param encoding: the encoding of this request (defaults to ``'utf-8'``).
|
||||
This encoding will be used to percent-encode the URL and to convert the
|
||||
body to ``str`` (if given as ``unicode``).
|
||||
:type encoding: string
|
||||
body to bytes (if given as a string).
|
||||
:type encoding: str
|
||||
|
||||
:param priority: the priority of this request (defaults to ``0``).
|
||||
The priority is used by the scheduler to define the order used to process
|
||||
@ -110,19 +119,31 @@ Request objects
|
||||
the scheduler. This is used when you want to perform an identical
|
||||
request multiple times, to ignore the duplicates filter. Use it with
|
||||
care, or you will get into crawling loops. Default to ``False``.
|
||||
:type dont_filter: boolean
|
||||
:type dont_filter: bool
|
||||
|
||||
:param errback: a function that will be called if any exception was
|
||||
raised while processing the request. This includes pages that failed
|
||||
with 404 HTTP errors and such. It receives a `Twisted Failure`_ instance
|
||||
as first parameter.
|
||||
:type errback: callable
|
||||
with 404 HTTP errors and such. It receives a
|
||||
:exc:`~twisted.python.failure.Failure` as first parameter.
|
||||
For more information,
|
||||
see :ref:`topics-request-response-ref-errbacks` below.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
The *callback* parameter is no longer required when the *errback*
|
||||
parameter is specified.
|
||||
:type errback: collections.abc.Callable
|
||||
|
||||
:param flags: Flags sent to the request, can be used for logging or similar purposes.
|
||||
:type flags: list
|
||||
|
||||
:param cb_kwargs: A dict with arbitrary data that will be passed as keyword arguments to the Request's callback.
|
||||
:type cb_kwargs: dict
|
||||
|
||||
.. attribute:: Request.url
|
||||
|
||||
A string containing the URL of this request. Keep in mind that this
|
||||
attribute contains the escaped URL, so it can differ from the URL passed in
|
||||
the constructor.
|
||||
the ``__init__`` method.
|
||||
|
||||
This attribute is read-only. To change the URL of a Request use
|
||||
:meth:`replace`.
|
||||
@ -138,7 +159,7 @@ Request objects
|
||||
|
||||
.. attribute:: Request.body
|
||||
|
||||
A str that contains the request body.
|
||||
The request body as bytes.
|
||||
|
||||
This attribute is read-only. To change the body of a Request use
|
||||
:meth:`replace`.
|
||||
@ -153,25 +174,40 @@ Request objects
|
||||
See :ref:`topics-request-meta` for a list of special meta keys
|
||||
recognized by Scrapy.
|
||||
|
||||
This dict is `shallow copied`_ when the request is cloned using the
|
||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
||||
spider, from the ``response.meta`` attribute.
|
||||
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||
accessed, in your spider, from the ``response.meta`` attribute.
|
||||
|
||||
.. _shallow copied: http://docs.python.org/library/copy.html
|
||||
.. attribute:: Request.cb_kwargs
|
||||
|
||||
A dictionary that contains arbitrary metadata for this request. Its contents
|
||||
will be passed to the Request's callback as keyword arguments. It is empty
|
||||
for new Requests, which means by default callbacks only get a :class:`Response`
|
||||
object as argument.
|
||||
|
||||
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||
accessed, in your spider, from the ``response.cb_kwargs`` attribute.
|
||||
|
||||
In case of a failure to process the request, this dict can be accessed as
|
||||
``failure.request.cb_kwargs`` in the request's errback. For more information,
|
||||
see :ref:`errback-cb_kwargs`.
|
||||
|
||||
.. method:: Request.copy()
|
||||
|
||||
Return a new Request which is a copy of this Request. See also:
|
||||
:ref:`topics-request-response-ref-request-callback-arguments`.
|
||||
|
||||
.. method:: Request.replace([url, method, headers, body, cookies, meta, encoding, dont_filter, callback, errback])
|
||||
.. method:: Request.replace([url, method, headers, body, cookies, meta, flags, encoding, priority, dont_filter, callback, errback, cb_kwargs])
|
||||
|
||||
Return a Request object with the same members, except for those members
|
||||
given new values by whichever keyword arguments are specified. The
|
||||
attribute :attr:`Request.meta` is copied by default (unless a new value
|
||||
is given in the ``meta`` argument). See also
|
||||
:attr:`Request.cb_kwargs` and :attr:`Request.meta` attributes are shallow
|
||||
copied by default (unless new values are given as arguments). See also
|
||||
:ref:`topics-request-response-ref-request-callback-arguments`.
|
||||
|
||||
.. automethod:: from_curl
|
||||
|
||||
.. _topics-request-response-ref-request-callback-arguments:
|
||||
|
||||
Passing additional data to callback functions
|
||||
@ -189,27 +225,121 @@ Example::
|
||||
|
||||
def parse_page2(self, response):
|
||||
# this would log http://www.example.com/some_page.html
|
||||
self.log("Visited %s" % response.url)
|
||||
self.logger.info("Visited %s", response.url)
|
||||
|
||||
In some cases you may be interested in passing arguments to those callback
|
||||
functions so you can receive the arguments later, in the second callback. You
|
||||
can use the :attr:`Request.meta` attribute for that.
|
||||
functions so you can receive the arguments later, in the second callback.
|
||||
The following example shows how to achieve this by using the
|
||||
:attr:`Request.cb_kwargs` attribute:
|
||||
|
||||
Here's an example of how to pass an item using this mechanism, to populate
|
||||
different fields from different pages::
|
||||
::
|
||||
|
||||
def parse_page1(self, response):
|
||||
item = MyItem()
|
||||
item['main_url'] = response.url
|
||||
request = scrapy.Request("http://www.example.com/some_page.html",
|
||||
callback=self.parse_page2)
|
||||
request.meta['item'] = item
|
||||
return request
|
||||
def parse(self, response):
|
||||
request = scrapy.Request('http://www.example.com/index.html',
|
||||
callback=self.parse_page2,
|
||||
cb_kwargs=dict(main_url=response.url))
|
||||
request.cb_kwargs['foo'] = 'bar' # add more arguments for the callback
|
||||
yield request
|
||||
|
||||
def parse_page2(self, response):
|
||||
item = response.meta['item']
|
||||
item['other_url'] = response.url
|
||||
return item
|
||||
def parse_page2(self, response, main_url, foo):
|
||||
yield dict(
|
||||
main_url=main_url,
|
||||
other_url=response.url,
|
||||
foo=foo,
|
||||
)
|
||||
|
||||
.. caution:: :attr:`Request.cb_kwargs` was introduced in version ``1.7``.
|
||||
Prior to that, using :attr:`Request.meta` was recommended for passing
|
||||
information around callbacks. After ``1.7``, :attr:`Request.cb_kwargs`
|
||||
became the preferred way for handling user information, leaving :attr:`Request.meta`
|
||||
for communication with components like middlewares and extensions.
|
||||
|
||||
.. _topics-request-response-ref-errbacks:
|
||||
|
||||
Using errbacks to catch exceptions in request processing
|
||||
--------------------------------------------------------
|
||||
|
||||
The errback of a request is a function that will be called when an exception
|
||||
is raised while processing it.
|
||||
|
||||
It receives a :exc:`~twisted.python.failure.Failure` as first parameter and can
|
||||
be used to track connection establishment timeouts, DNS errors etc.
|
||||
|
||||
Here's an example spider logging all errors and catching some specific
|
||||
errors if needed::
|
||||
|
||||
import scrapy
|
||||
|
||||
from scrapy.spidermiddlewares.httperror import HttpError
|
||||
from twisted.internet.error import DNSLookupError
|
||||
from twisted.internet.error import TimeoutError, TCPTimedOutError
|
||||
|
||||
class ErrbackSpider(scrapy.Spider):
|
||||
name = "errback_example"
|
||||
start_urls = [
|
||||
"http://www.httpbin.org/", # HTTP 200 expected
|
||||
"http://www.httpbin.org/status/404", # Not found error
|
||||
"http://www.httpbin.org/status/500", # server issue
|
||||
"http://www.httpbin.org:12345/", # non-responding host, timeout expected
|
||||
"http://www.httphttpbinbin.org/", # DNS error expected
|
||||
]
|
||||
|
||||
def start_requests(self):
|
||||
for u in self.start_urls:
|
||||
yield scrapy.Request(u, callback=self.parse_httpbin,
|
||||
errback=self.errback_httpbin,
|
||||
dont_filter=True)
|
||||
|
||||
def parse_httpbin(self, response):
|
||||
self.logger.info('Got successful response from {}'.format(response.url))
|
||||
# do something useful here...
|
||||
|
||||
def errback_httpbin(self, failure):
|
||||
# log all failures
|
||||
self.logger.error(repr(failure))
|
||||
|
||||
# in case you want to do something special for some errors,
|
||||
# you may need the failure's type:
|
||||
|
||||
if failure.check(HttpError):
|
||||
# these exceptions come from HttpError spider middleware
|
||||
# you can get the non-200 response
|
||||
response = failure.value.response
|
||||
self.logger.error('HttpError on %s', response.url)
|
||||
|
||||
elif failure.check(DNSLookupError):
|
||||
# this is the original request
|
||||
request = failure.request
|
||||
self.logger.error('DNSLookupError on %s', request.url)
|
||||
|
||||
elif failure.check(TimeoutError, TCPTimedOutError):
|
||||
request = failure.request
|
||||
self.logger.error('TimeoutError on %s', request.url)
|
||||
|
||||
.. _errback-cb_kwargs:
|
||||
|
||||
Accessing additional data in errback functions
|
||||
----------------------------------------------
|
||||
|
||||
In case of a failure to process the request, you may be interested in
|
||||
accessing arguments to the callback functions so you can process further
|
||||
based on the arguments in the errback. The following example shows how to
|
||||
achieve this by using ``Failure.request.cb_kwargs``::
|
||||
|
||||
def parse(self, response):
|
||||
request = scrapy.Request('http://www.example.com/index.html',
|
||||
callback=self.parse_page2,
|
||||
errback=self.errback_page2,
|
||||
cb_kwargs=dict(main_url=response.url))
|
||||
yield request
|
||||
|
||||
def parse_page2(self, response, main_url):
|
||||
pass
|
||||
|
||||
def errback_page2(self, failure):
|
||||
yield dict(
|
||||
main_url=failure.request.cb_kwargs['main_url'],
|
||||
)
|
||||
|
||||
.. _topics-request-meta:
|
||||
|
||||
@ -224,10 +354,23 @@ Those are:
|
||||
* :reqmeta:`dont_redirect`
|
||||
* :reqmeta:`dont_retry`
|
||||
* :reqmeta:`handle_httpstatus_list`
|
||||
* ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
|
||||
* :reqmeta:`handle_httpstatus_all`
|
||||
* :reqmeta:`dont_merge_cookies`
|
||||
* :reqmeta:`cookiejar`
|
||||
* :reqmeta:`dont_cache`
|
||||
* :reqmeta:`redirect_reasons`
|
||||
* :reqmeta:`redirect_urls`
|
||||
* :reqmeta:`bindaddress`
|
||||
* :reqmeta:`dont_obey_robotstxt`
|
||||
* :reqmeta:`download_timeout`
|
||||
* :reqmeta:`download_maxsize`
|
||||
* :reqmeta:`download_latency`
|
||||
* :reqmeta:`download_fail_on_dataloss`
|
||||
* :reqmeta:`proxy`
|
||||
* ``ftp_user`` (See :setting:`FTP_USER` for more info)
|
||||
* ``ftp_password`` (See :setting:`FTP_PASSWORD` for more info)
|
||||
* :reqmeta:`referrer_policy`
|
||||
* :reqmeta:`max_retry_times`
|
||||
|
||||
.. reqmeta:: bindaddress
|
||||
|
||||
@ -236,6 +379,86 @@ bindaddress
|
||||
|
||||
The outgoing IP address to use for performing the request.
|
||||
|
||||
.. reqmeta:: download_timeout
|
||||
|
||||
download_timeout
|
||||
----------------
|
||||
|
||||
The amount of time (in secs) that the downloader will wait before timing out.
|
||||
See also: :setting:`DOWNLOAD_TIMEOUT`.
|
||||
|
||||
.. reqmeta:: download_latency
|
||||
|
||||
download_latency
|
||||
----------------
|
||||
|
||||
The amount of time spent to fetch the response, since the request was
started, i.e. since the HTTP message was sent over the network. This meta key only becomes
|
||||
available when the response has been downloaded. While most other meta keys are
|
||||
used to control Scrapy behavior, this one is supposed to be read-only.
|
||||
|
||||
.. reqmeta:: download_fail_on_dataloss
|
||||
|
||||
download_fail_on_dataloss
|
||||
-------------------------
|
||||
|
||||
Whether or not to fail on broken responses. See:
|
||||
:setting:`DOWNLOAD_FAIL_ON_DATALOSS`.
|
||||
|
||||
.. reqmeta:: max_retry_times
|
||||
|
||||
max_retry_times
|
||||
---------------
|
||||
|
||||
The meta key is used to set retry times per request. When initialized, the
|
||||
:reqmeta:`max_retry_times` meta key takes higher precedence over the
|
||||
:setting:`RETRY_TIMES` setting.
|
||||
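For example, request-scoped keys such as :reqmeta:`download_timeout` and
:reqmeta:`max_retry_times` can be set directly on a request (the URL, callback
and values are illustrative)::

    yield scrapy.Request(
        'http://www.example.com/slow-page',
        callback=self.parse_slow_page,  # hypothetical callback
        meta={
            'download_timeout': 30,  # seconds, overrides DOWNLOAD_TIMEOUT
            'max_retry_times': 5,    # overrides RETRY_TIMES for this request
        },
    )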
|
||||
|
||||
.. _topics-stop-response-download:
|
||||
|
||||
Stopping the download of a Response
|
||||
===================================
|
||||
|
||||
Raising a :exc:`~scrapy.exceptions.StopDownload` exception from a
|
||||
:class:`~scrapy.signals.bytes_received` signal handler will stop the
|
||||
download of a given response. See the following example::
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class StopSpider(scrapy.Spider):
|
||||
name = "stop"
|
||||
start_urls = ["https://docs.scrapy.org/en/latest/"]
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
spider = super().from_crawler(crawler)
|
||||
crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
|
||||
return spider
|
||||
|
||||
def parse(self, response):
|
||||
# 'last_chars' show that the full response was not downloaded
|
||||
yield {"len": len(response.text), "last_chars": response.text[-40:]}
|
||||
|
||||
def on_bytes_received(self, data, request, spider):
|
||||
raise scrapy.exceptions.StopDownload(fail=False)
|
||||
|
||||
which produces the following output::
|
||||
|
||||
2020-05-19 17:26:12 [scrapy.core.engine] INFO: Spider opened
|
||||
2020-05-19 17:26:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
|
||||
2020-05-19 17:26:13 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.on_bytes_received
|
||||
2020-05-19 17:26:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
|
||||
2020-05-19 17:26:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
|
||||
{'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
|
||||
2020-05-19 17:26:13 [scrapy.core.engine] INFO: Closing spider (finished)
|
||||
|
||||
By default, resulting responses are handled by their corresponding errbacks. To
|
||||
call their callback instead, like in this example, pass ``fail=False`` to the
|
||||
:exc:`~scrapy.exceptions.StopDownload` exception.
|
||||
|
||||
|
||||
.. _topics-request-response-ref-request-subclasses:
|
||||
|
||||
Request subclasses
|
||||
@ -251,23 +474,23 @@ The FormRequest class extends the base :class:`Request` with functionality for
|
||||
dealing with HTML forms. It uses `lxml.html forms`_ to pre-populate form
|
||||
fields with form data from :class:`Response` objects.
|
||||
|
||||
.. _lxml.html forms: http://lxml.de/lxmlhtml.html#forms
|
||||
.. _lxml.html forms: https://lxml.de/lxmlhtml.html#forms
|
||||
|
||||
.. class:: FormRequest(url, [formdata, ...])
|
||||
|
||||
The :class:`FormRequest` class adds a new argument to the constructor. The
|
||||
The :class:`FormRequest` class adds a new keyword parameter to the ``__init__`` method. The
|
||||
remaining arguments are the same as for the :class:`Request` class and are
|
||||
not documented here.
|
||||
|
||||
:param formdata: is a dictionary (or iterable of (key, value) tuples)
|
||||
containing HTML Form data which will be url-encoded and assigned to the
|
||||
body of the request.
|
||||
:type formdata: dict or iterable of tuples
|
||||
:type formdata: dict or collections.abc.Iterable
|
||||
|
||||
The :class:`FormRequest` objects support the following class method in
|
||||
addition to the standard :class:`Request` methods:
|
||||
|
||||
.. classmethod:: FormRequest.from_response(response, [formname=None, formnumber=0, formdata=None, formxpath=None, clickdata=None, dont_click=False, ...])
|
||||
.. classmethod:: FormRequest.from_response(response, [formname=None, formid=None, formnumber=0, formdata=None, formxpath=None, formcss=None, clickdata=None, dont_click=False, ...])
|
||||
|
||||
Returns a new :class:`FormRequest` object with its form field values
|
||||
pre-populated with those found in the HTML ``<form>`` element contained
|
||||
@ -285,23 +508,35 @@ fields with form data from :class:`Response` objects.
|
||||
control clicked (instead of disabling it) you can also use the
|
||||
``clickdata`` argument.
|
||||
|
||||
.. caution:: Using this method with select elements which have leading
|
||||
or trailing whitespace in the option values will not work due to a
|
||||
`bug in lxml`_, which should be fixed in lxml 3.8 and above.
|
||||
|
||||
:param response: the response containing a HTML form which will be used
|
||||
to pre-populate the form fields
|
||||
:type response: :class:`Response` object
|
||||
|
||||
:param formname: if given, the form with name attribute set to this value will be used.
|
||||
:type formname: string
|
||||
:type formname: str
|
||||
|
||||
:param formid: if given, the form with id attribute set to this value will be used.
|
||||
:type formid: str
|
||||
|
||||
:param formxpath: if given, the first form that matches the xpath will be used.
|
||||
:type formxpath: string
|
||||
:type formxpath: str
|
||||
|
||||
:param formcss: if given, the first form that matches the css selector will be used.
|
||||
:type formcss: str
|
||||
|
||||
:param formnumber: the number of form to use, when the response contains
|
||||
multiple forms. The first one (and also the default) is ``0``.
|
||||
:type formnumber: integer
|
||||
:type formnumber: int
|
||||
|
||||
:param formdata: fields to override in the form data. If a field was
|
||||
already present in the response ``<form>`` element, its value is
|
||||
overridden by the one passed in this parameter.
|
||||
overridden by the one passed in this parameter. If a value passed in
|
||||
this parameter is ``None``, the field will not be included in the
|
||||
request, even if it was present in the response ``<form>`` element.
|
||||
:type formdata: dict
|
||||
|
||||
:param clickdata: attributes to lookup the control clicked. If it's not
|
||||
@ -313,10 +548,10 @@ fields with form data from :class:`Response` objects.
|
||||
|
||||
:param dont_click: If True, the form data will be submitted without
|
||||
clicking in any element.
|
||||
:type dont_click: boolean
|
||||
:type dont_click: bool
|
||||
|
||||
The other parameters of this class method are passed directly to the
|
||||
:class:`FormRequest` constructor.
|
||||
:class:`FormRequest` ``__init__`` method.
|
||||
|
||||
.. versionadded:: 0.10.3
|
||||
The ``formname`` parameter.
|
||||
@ -324,6 +559,12 @@ fields with form data from :class:`Response` objects.
|
||||
.. versionadded:: 0.17
|
||||
The ``formxpath`` parameter.
|
||||
|
||||
.. versionadded:: 1.1.0
|
||||
The ``formcss`` parameter.
|
||||
|
||||
.. versionadded:: 1.1.0
|
||||
The ``formid`` parameter.
|
||||
|
||||
Request usage examples
|
||||
----------------------
|
||||
|
||||
@ -353,6 +594,11 @@ method for this job. Here's an example spider which uses it::
|
||||
|
||||
import scrapy
|
||||
|
||||
def authentication_failed(response):
|
||||
# TODO: Check the contents of the response and return True if it failed
|
||||
# or False if it succeeded.
|
||||
pass
|
||||
|
||||
class LoginSpider(scrapy.Spider):
|
||||
name = 'example.com'
|
||||
start_urls = ['http://www.example.com/users/login.php']
|
||||
@ -365,47 +611,91 @@ method for this job. Here's an example spider which uses it::
|
||||
)
|
||||
|
||||
def after_login(self, response):
|
||||
# check login succeed before going on
|
||||
if "authentication failed" in response.body:
|
||||
self.log("Login failed", level=log.ERROR)
|
||||
if authentication_failed(response):
|
||||
self.logger.error("Login failed")
|
||||
return
|
||||
|
||||
# continue scraping with authenticated session...
|
||||
|
||||
JsonRequest
|
||||
-----------
|
||||
|
||||
The JsonRequest class extends the base :class:`Request` class with functionality for
|
||||
dealing with JSON requests.
|
||||
|
||||
.. class:: JsonRequest(url, [... data, dumps_kwargs])
|
||||
|
||||
The :class:`JsonRequest` class adds two new keyword parameters to the ``__init__`` method. The
|
||||
remaining arguments are the same as for the :class:`Request` class and are
|
||||
not documented here.
|
||||
|
||||
Using the :class:`JsonRequest` will set the ``Content-Type`` header to ``application/json``
|
||||
and the ``Accept`` header to ``application/json, text/javascript, */*; q=0.01``.
|
||||
|
||||
:param data: any JSON serializable object that needs to be JSON encoded and assigned to the body.
If the :attr:`Request.body` argument is provided, this parameter will be ignored.
If the :attr:`Request.body` argument is not provided and the ``data`` argument is provided,
:attr:`Request.method` will be set to ``'POST'`` automatically.
|
||||
:type data: object
|
||||
|
||||
:param dumps_kwargs: Parameters that will be passed to underlying :func:`json.dumps` method which is used to serialize
|
||||
data into JSON format.
|
||||
:type dumps_kwargs: dict
|
||||
|
||||
JsonRequest usage example
|
||||
-------------------------
|
||||
|
||||
Sending a JSON POST request with a JSON payload::
|
||||
|
||||
data = {
|
||||
'name1': 'value1',
|
||||
'name2': 'value2',
|
||||
}
|
||||
yield JsonRequest(url='http://www.example.com/post/action', data=data)
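
To customize serialization, ``dumps_kwargs`` is forwarded to :func:`json.dumps`.
A small sketch (the URL is a placeholder)::

    yield JsonRequest(
        url='http://www.example.com/post/action',
        data=data,
        dumps_kwargs={'sort_keys': True},
    )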
|
||||
|
||||
|
||||
Response objects
|
||||
================
|
||||
|
||||
.. class:: Response(url, [status=200, headers, body, flags])
|
||||
.. autoclass:: Response
|
||||
|
||||
A :class:`Response` object represents an HTTP response, which is usually
|
||||
downloaded (by the Downloader) and fed to the Spiders for processing.
|
||||
|
||||
:param url: the URL of this response
|
||||
:type url: string
|
||||
:type url: str
|
||||
|
||||
:param status: the HTTP status of the response. Defaults to ``200``.
|
||||
:type status: int
|
||||
|
||||
:param headers: the headers of this response. The dict values can be strings
|
||||
(for single valued headers) or lists (for multi-valued headers).
|
||||
:type headers: dict
|
||||
|
||||
:param status: the HTTP status of the response. Defaults to ``200``.
|
||||
:type status: integer
|
||||
|
||||
:param body: the response body. It must be str, not unicode, unless you're
|
||||
using a encoding-aware :ref:`Response subclass
|
||||
<topics-request-response-ref-response-subclasses>`, such as
|
||||
:class:`TextResponse`.
|
||||
:type body: str
|
||||
|
||||
:param meta: the initial values for the :attr:`Response.meta` attribute. If
|
||||
given, the dict will be shallow copied.
|
||||
:type meta: dict
|
||||
:param body: the response body. To access the decoded text as a string, use
|
||||
``response.text`` from an encoding-aware
|
||||
:ref:`Response subclass <topics-request-response-ref-response-subclasses>`,
|
||||
such as :class:`TextResponse`.
|
||||
:type body: bytes
|
||||
|
||||
:param flags: is a list containing the initial values for the
|
||||
:attr:`Response.flags` attribute. If given, the list will be shallow
|
||||
copied.
|
||||
:type flags: list
|
||||
|
||||
:param request: the initial value of the :attr:`Response.request` attribute.
|
||||
This represents the :class:`Request` that generated this response.
|
||||
:type request: scrapy.http.Request
|
||||
|
||||
:param certificate: an object representing the server's SSL certificate.
|
||||
:type certificate: twisted.internet.ssl.Certificate
|
||||
|
||||
:param ip_address: The IP address of the server from which the Response originated.
|
||||
:type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address`
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
The ``ip_address`` parameter.
|
||||
|
||||
.. attribute:: Response.url
|
||||
|
||||
A string containing the URL of the response.
|
||||
@ -420,14 +710,20 @@ Response objects
|
||||
|
||||
.. attribute:: Response.headers
|
||||
|
||||
A dictionary-like object which contains the response headers.
|
||||
A dictionary-like object which contains the response headers. Values can
|
||||
be accessed using :meth:`get` to return the first header value with the
|
||||
specified name or :meth:`getlist` to return all header values with the
|
||||
specified name. For example, this call will give you all cookies in the
|
||||
headers::
|
||||
|
||||
response.headers.getlist('Set-Cookie')
|
||||
|
||||
.. attribute:: Response.body
|
||||
|
||||
A str containing the body of this Response. Keep in mind that Response.body
|
||||
is always a str. If you want the unicode version use
|
||||
:meth:`TextResponse.body_as_unicode` (only available in
|
||||
:class:`TextResponse` and subclasses).
|
||||
The response body as bytes.
|
||||
|
||||
If you want the body as a string, use :attr:`TextResponse.text` (only
|
||||
available in :class:`TextResponse` and subclasses).
|
||||
|
||||
This attribute is read-only. To change the body of a Response use
|
||||
:meth:`replace`.
|
||||
@ -453,7 +749,7 @@ Response objects
|
||||
.. attribute:: Response.meta
|
||||
|
||||
A shortcut to the :attr:`Request.meta` attribute of the
|
||||
:attr:`Response.request` object (ie. ``self.request.meta``).
|
||||
:attr:`Response.request` object (i.e. ``self.request.meta``).
|
||||
|
||||
Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
|
||||
attribute is propagated along redirects and retries, so you will get
|
||||
@ -461,13 +757,44 @@ Response objects
|
||||
|
||||
.. seealso:: :attr:`Request.meta` attribute
|
||||
|
||||
.. attribute:: Response.cb_kwargs
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
A shortcut to the :attr:`Request.cb_kwargs` attribute of the
|
||||
:attr:`Response.request` object (i.e. ``self.request.cb_kwargs``).
|
||||
|
||||
Unlike the :attr:`Response.request` attribute, the
|
||||
:attr:`Response.cb_kwargs` attribute is propagated along redirects and
|
||||
retries, so you will get the original :attr:`Request.cb_kwargs` sent
|
||||
from your spider.
|
||||
|
||||
.. seealso:: :attr:`Request.cb_kwargs` attribute
|
||||
|
||||
.. attribute:: Response.flags
|
||||
|
||||
A list that contains flags for this response. Flags are labels used for
|
||||
tagging Responses. For example: `'cached'`, `'redirected`', etc. And
|
||||
tagging Responses. For example: ``'cached'``, ``'redirected'``, etc. And
|
||||
they're shown on the string representation of the Response (`__str__`
|
||||
method) which is used by the engine for logging.
|
||||
|
||||
.. attribute:: Response.certificate
|
||||
|
||||
A :class:`twisted.internet.ssl.Certificate` object representing
|
||||
the server's SSL certificate.
|
||||
|
||||
Only populated for ``https`` responses, ``None`` otherwise.
|
||||
|
||||
.. attribute:: Response.ip_address
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
The IP address of the server from which the Response originated.
|
||||
|
||||
This attribute is currently only populated by the HTTP 1.1 download
|
||||
handler, i.e. for ``http(s)`` responses. For other handlers,
|
||||
:attr:`ip_address` is always ``None``.
|
||||
|
||||
.. method:: Response.copy()
|
||||
|
||||
Returns a new Response which is a copy of this Response.
|
||||
@ -478,6 +805,21 @@ Response objects
|
||||
given new values by whichever keyword arguments are specified. The
|
||||
attribute :attr:`Response.meta` is copied by default.
|
||||
|
||||
.. method:: Response.urljoin(url)
|
||||
|
||||
Constructs an absolute url by combining the Response's :attr:`url` with
|
||||
a possible relative url.
|
||||
|
||||
This is a wrapper over :func:`~urllib.parse.urljoin`; it's merely an alias for
|
||||
making this call::
|
||||
|
||||
urllib.parse.urljoin(response.url, url)
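
For example, assuming the current response URL is
``http://www.example.com/a/b.html``, relative and absolute paths are
resolved like this::

    >>> response.urljoin('c.html')
    'http://www.example.com/a/c.html'
    >>> response.urljoin('/index.html')
    'http://www.example.com/index.html'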
|
||||
|
||||
.. automethod:: Response.follow
|
||||
|
||||
.. automethod:: Response.follow_all
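
As an illustrative sketch (the relative URLs and callback are placeholders),
:meth:`Response.follow_all` yields one request per given URL::

    def parse(self, response):
        yield from response.follow_all(
            ['page2.html', 'page3.html'], callback=self.parse
        )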
|
||||
|
||||
|
||||
.. _topics-request-response-ref-response-subclasses:
|
||||
|
||||
Response subclasses
|
||||
@ -495,29 +837,46 @@ TextResponse objects
|
||||
:class:`Response` class, which is meant to be used only for binary data,
|
||||
such as images, sounds or any media file.
|
||||
|
||||
:class:`TextResponse` objects support a new constructor argument, in
|
||||
:class:`TextResponse` objects support a new ``__init__`` method argument, in
|
||||
addition to the base :class:`Response` objects. The remaining functionality
|
||||
is the same as for the :class:`Response` class and is not documented here.
|
||||
|
||||
:param encoding: is a string which contains the encoding to use for this
|
||||
response. If you create a :class:`TextResponse` object with a unicode
|
||||
body, it will be encoded using this encoding (remember the body attribute
|
||||
is always a string). If ``encoding`` is ``None`` (default value), the
|
||||
encoding will be looked up in the response headers and body instead.
|
||||
:type encoding: string
|
||||
response. If you create a :class:`TextResponse` object with a string as
|
||||
body, it will be converted to bytes encoded using this encoding. If
|
||||
*encoding* is ``None`` (default), the encoding will be looked up in the
|
||||
response headers and body instead.
|
||||
:type encoding: str
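
For example, a minimal sketch of building a :class:`TextResponse` with a
string body and an explicit encoding::

    from scrapy.http import TextResponse

    response = TextResponse(
        url='http://www.example.com',
        body='Café',
        encoding='utf-8',
    )
    # response.body is now b'Caf\xc3\xa9'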
|
||||
|
||||
:class:`TextResponse` objects support the following attributes in addition
|
||||
to the standard :class:`Response` ones:
|
||||
|
||||
.. attribute:: TextResponse.text
|
||||
|
||||
Response body, as a string.
|
||||
|
||||
The same as ``response.body.decode(response.encoding)``, but the
|
||||
result is cached after the first call, so you can access
|
||||
``response.text`` multiple times without extra overhead.
|
||||
|
||||
.. note::
|
||||
|
||||
``str(response.body)`` is not a correct way to convert the response
|
||||
body into a string:
|
||||
|
||||
>>> str(b'body')
|
||||
"b'body'"
|
||||
|
||||
|
||||
.. attribute:: TextResponse.encoding
|
||||
|
||||
A string with the encoding of this response. The encoding is resolved by
|
||||
trying the following mechanisms, in order:
|
||||
|
||||
1. the encoding passed in the constructor `encoding` argument
|
||||
1. the encoding passed in the ``__init__`` method ``encoding`` argument
|
||||
|
||||
2. the encoding declared in the Content-Type HTTP header. If this
|
||||
encoding is not valid (ie. unknown), it is ignored and the next
|
||||
encoding is not valid (i.e. unknown), it is ignored and the next
|
||||
resolution mechanism is tried.
|
||||
|
||||
3. the encoding declared in the response body. The TextResponse class
|
||||
@ -535,20 +894,6 @@ TextResponse objects
|
||||
:class:`TextResponse` objects support the following methods in addition to
|
||||
the standard :class:`Response` ones:
|
||||
|
||||
.. method:: TextResponse.body_as_unicode()
|
||||
|
||||
Returns the body of the response as unicode. This is equivalent to::
|
||||
|
||||
response.body.decode(response.encoding)
|
||||
|
||||
But **not** equivalent to::
|
||||
|
||||
unicode(response.body)
|
||||
|
||||
Since, in the latter case, you would be using you system default encoding
|
||||
(typically `ascii`) to convert the body to unicode, instead of the response
|
||||
encoding.
|
||||
|
||||
.. method:: TextResponse.xpath(query)
|
||||
|
||||
A shortcut to ``TextResponse.selector.xpath(query)``::
|
||||
@ -561,6 +906,15 @@ TextResponse objects
|
||||
|
||||
response.css('p')
|
||||
|
||||
.. automethod:: TextResponse.follow
|
||||
|
||||
.. automethod:: TextResponse.follow_all
|
||||
|
||||
.. automethod:: TextResponse.json()
|
||||
|
||||
Returns a Python object deserialized from the JSON document in the response body.
|
||||
The result is cached after the first call.
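
For example, assuming the response body is a JSON document::

    >>> response.json()
    {'name': 'value'}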
|
||||
|
||||
|
||||
HtmlResponse objects
|
||||
--------------------
|
||||
@ -571,7 +925,7 @@ HtmlResponse objects
|
||||
which adds encoding auto-discovering support by looking into the HTML `meta
|
||||
http-equiv`_ attribute. See :attr:`TextResponse.encoding`.
|
||||
|
||||
.. _meta http-equiv: http://www.w3schools.com/TAGS/att_meta_http_equiv.asp
|
||||
.. _meta http-equiv: https://www.w3schools.com/TAGS/att_meta_http_equiv.asp
|
||||
|
||||
XmlResponse objects
|
||||
-------------------
|
||||
@ -582,4 +936,4 @@ XmlResponse objects
|
||||
adds encoding auto-discovering support by looking into the XML declaration
|
||||
line. See :attr:`TextResponse.encoding`.
|
||||
|
||||
.. _Twisted Failure: http://twistedmatrix.com/documents/current/api/twisted.python.failure.Failure.html
|
||||
.. _bug in lxml: https://bugs.launchpad.net/lxml/+bug/1665241
|
||||
|
@ -1,3 +1,5 @@
|
||||
:orphan:
|
||||
|
||||
.. _topics-scrapyd:
|
||||
|
||||
=======
|
||||
@ -8,4 +10,4 @@ Scrapyd has been moved into a separate project.
|
||||
|
||||
Its documentation is now hosted at:
|
||||
|
||||
http://scrapyd.readthedocs.org/
|
||||
https://scrapyd.readthedocs.io/en/latest/
|
||||
|
@ -17,6 +17,9 @@ spider, without having to run the spider to test every change.
|
||||
Once you get familiarized with the Scrapy shell, you'll see that it's an
|
||||
invaluable tool for developing and debugging your spiders.
|
||||
|
||||
Configuring the shell
|
||||
=====================
|
||||
|
||||
If you have `IPython`_ installed, the Scrapy shell will use it (instead of the
|
||||
standard Python console). The `IPython`_ console is much more powerful and
|
||||
provides smart auto-completion and colorized output, among other things.
|
||||
@ -25,8 +28,20 @@ We highly recommend you install `IPython`_, specially if you're working on
|
||||
Unix systems (where `IPython`_ excels). See the `IPython installation guide`_
|
||||
for more info.
|
||||
|
||||
.. _IPython: http://ipython.org/
|
||||
.. _IPython installation guide: http://ipython.org/install.html
|
||||
Scrapy also has support for `bpython`_, and will try to use it where `IPython`_
|
||||
is unavailable.
|
||||
|
||||
Through Scrapy's settings you can configure it to use any one of
|
||||
``ipython``, ``bpython`` or the standard ``python`` shell, regardless of which
|
||||
are installed. This is done by setting the ``SCRAPY_PYTHON_SHELL`` environment
|
||||
variable; or by defining it in your :ref:`scrapy.cfg <topics-config-settings>`::
|
||||
|
||||
[settings]
|
||||
shell = bpython
|
||||
|
||||
.. _IPython: https://ipython.org/
|
||||
.. _IPython installation guide: https://ipython.org/install.html
|
||||
.. _bpython: https://bpython-interpreter.org/
|
||||
|
||||
Launch the shell
|
||||
================
|
||||
@ -38,6 +53,38 @@ this::
|
||||
|
||||
Where the ``<url>`` is the URL you want to scrape.
|
||||
|
||||
:command:`shell` also works for local files. This can be handy if you want
|
||||
to play around with a local copy of a web page. :command:`shell` understands
|
||||
the following syntaxes for local files::
|
||||
|
||||
# UNIX-style
|
||||
scrapy shell ./path/to/file.html
|
||||
scrapy shell ../other/path/to/file.html
|
||||
scrapy shell /absolute/path/to/file.html
|
||||
|
||||
# File URI
|
||||
scrapy shell file:///absolute/path/to/file.html
|
||||
|
||||
.. note:: When using relative file paths, be explicit and prepend them
|
||||
with ``./`` (or ``../`` when relevant).
|
||||
``scrapy shell index.html`` will not work as one might expect (and
|
||||
this is by design, not a bug).
|
||||
|
||||
Because :command:`shell` favors HTTP URLs over File URIs,
|
||||
and ``index.html`` being syntactically similar to ``example.com``,
|
||||
:command:`shell` will treat ``index.html`` as a domain name and trigger
|
||||
a DNS lookup error::
|
||||
|
||||
$ scrapy shell index.html
|
||||
[ ... scrapy shell starts ... ]
|
||||
[ ... traceback ... ]
|
||||
twisted.internet.error.DNSLookupError: DNS lookup failed:
|
||||
address 'index.html' not found: [Errno -5] No address associated with hostname.
|
||||
|
||||
:command:`shell` will not test beforehand if a file called ``index.html``
|
||||
exists in the current directory. Again, be explicit.
|
||||
|
||||
|
||||
Using the shell
|
||||
===============
|
||||
|
||||
@ -50,13 +97,17 @@ Available Shortcuts
|
||||
|
||||
* ``shelp()`` - print a help with the list of available objects and shortcuts
|
||||
|
||||
* ``fetch(request_or_url)`` - fetch a new response from the given request or
|
||||
URL and update all related objects accordingly.
|
||||
* ``fetch(url[, redirect=True])`` - fetch a new response from the given
|
||||
URL and update all related objects accordingly. You can optionally ask for
HTTP 3xx redirections not to be followed by passing ``redirect=False``
(see the example after this list).
|
||||
|
||||
* ``fetch(request)`` - fetch a new response from the given request and
|
||||
update all related objects accordingly.
|
||||
|
||||
* ``view(response)`` - open the given response in your local web browser, for
|
||||
inspection. This will add a `\<base\> tag`_ to the response body in order
|
||||
for external links (such as images and style sheets) to display properly.
|
||||
Note, however,that this will create a temporary file in your computer,
|
||||
Note, however, that this will create a temporary file in your computer,
|
||||
which won't be removed automatically.
|
||||
|
||||
.. _<base> tag: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
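
For example, a quick sketch of fetching a URL in the shell without following
redirects (the URL is a placeholder)::

    >>> fetch('http://www.example.com/some-redirecting-page', redirect=False)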
|
||||
@ -74,7 +125,7 @@ Those objects are:
|
||||
* ``crawler`` - the current :class:`~scrapy.crawler.Crawler` object.
|
||||
|
||||
* ``spider`` - the Spider which is known to handle the URL, or a
|
||||
:class:`~scrapy.spider.Spider` object if there is no spider found for
|
||||
:class:`~scrapy.spiders.Spider` object if there is no spider found for
|
||||
the current URL
|
||||
|
||||
* ``request`` - a :class:`~scrapy.http.Request` object of the last fetched
|
||||
@ -85,19 +136,16 @@ Those objects are:
|
||||
* ``response`` - a :class:`~scrapy.http.Response` object containing the last
|
||||
fetched page
|
||||
|
||||
* ``sel`` - a :class:`~scrapy.selector.Selector` object constructed
|
||||
with the last response fetched
|
||||
|
||||
* ``settings`` - the current :ref:`Scrapy settings <topics-settings>`
|
||||
|
||||
Example of shell session
|
||||
========================
|
||||
|
||||
Here's an example of a typical shell session where we start by scraping the
|
||||
http://scrapy.org page, and then proceed to scrape the http://slashdot.org
|
||||
page. Finally, we modify the (Slashdot) request method to POST and re-fetch it
|
||||
getting a HTTP 405 (method not allowed) error. We end the session by typing
|
||||
Ctrl-D (in Unix systems) or Ctrl-Z in Windows.
|
||||
https://scrapy.org page, and then proceed to scrape the https://old.reddit.com/
|
||||
page. Finally, we modify the (Reddit) request method to POST and re-fetch it
|
||||
getting an error. We end the session by typing Ctrl-D (in Unix systems) or
|
||||
Ctrl-Z in Windows.
|
||||
|
||||
Keep in mind that the data extracted here may not be the same when you try it,
|
||||
as those pages are not static and could have changed by the time you test this.
|
||||
@ -106,57 +154,81 @@ shell works.
|
||||
|
||||
First, we launch the shell::
|
||||
|
||||
scrapy shell 'http://scrapy.org' --nolog
|
||||
scrapy shell 'https://scrapy.org' --nolog
|
||||
|
||||
.. note::
|
||||
|
||||
Remember to always enclose URLs in quotes when running the Scrapy shell from
|
||||
the command line, otherwise URLs containing arguments (i.e. the ``&`` character)
|
||||
will not work.
|
||||
|
||||
On Windows, use double quotes instead::
|
||||
|
||||
scrapy shell "https://scrapy.org" --nolog
|
||||
|
||||
|
||||
Then, the shell fetches the URL (using the Scrapy downloader) and prints the
|
||||
list of available objects and useful shortcuts (you'll notice that these lines
|
||||
all start with the ``[s]`` prefix)::
|
||||
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x1e16b50>
|
||||
[s] scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x7f07395dd690>
|
||||
[s] item {}
|
||||
[s] request <GET http://scrapy.org>
|
||||
[s] response <200 http://scrapy.org>
|
||||
[s] sel <Selector xpath=None data=u'<html>\n <head>\n <meta charset="utf-8'>
|
||||
[s] settings <scrapy.settings.Settings object at 0x2bfd650>
|
||||
[s] spider <Spider 'default' at 0x20c6f50>
|
||||
[s] request <GET https://scrapy.org>
|
||||
[s] response <200 https://scrapy.org/>
|
||||
[s] settings <scrapy.settings.Settings object at 0x7f07395dd710>
|
||||
[s] spider <DefaultSpider 'default' at 0x7f0735891690>
|
||||
[s] Useful shortcuts:
|
||||
[s] fetch(url[, redirect=True]) Fetch URL and update local objects (by default, redirects are followed)
|
||||
[s] fetch(req) Fetch a scrapy.Request and update local objects
|
||||
[s] shelp() Shell help (print this help)
|
||||
[s] fetch(req_or_url) Fetch request (or URL) and update local objects
|
||||
[s] view(response) View response in a browser
|
||||
|
||||
>>>
|
||||
|
||||
After that, we can start playing with the objects::
|
||||
|
||||
>>> sel.xpath("//h2/text()").extract()[0]
|
||||
u'Welcome to Scrapy'
|
||||
After that, we can start playing with the objects:
|
||||
|
||||
>>> fetch("http://slashdot.org")
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x1a13b50>
|
||||
[s] item {}
|
||||
[s] request <GET http://slashdot.org>
|
||||
[s] response <200 http://slashdot.org>
|
||||
[s] sel <Selector xpath=None data=u'<html lang="en">\n<head>\n\n\n\n\n<script id="'>
|
||||
[s] settings <scrapy.settings.Settings object at 0x2bfd650>
|
||||
[s] spider <Spider 'default' at 0x20c6f50>
|
||||
[s] Useful shortcuts:
|
||||
[s] shelp() Shell help (print this help)
|
||||
[s] fetch(req_or_url) Fetch request (or URL) and update local objects
|
||||
[s] view(response) View response in a browser
|
||||
>>> response.xpath('//title/text()').get()
|
||||
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework'
|
||||
|
||||
>>> sel.xpath('//title/text()').extract()
|
||||
[u'Slashdot: News for nerds, stuff that matters']
|
||||
>>> fetch("https://old.reddit.com/")
|
||||
|
||||
>>> request = request.replace(method="POST")
|
||||
>>> response.xpath('//title/text()').get()
|
||||
'reddit: the front page of the internet'
|
||||
|
||||
>>> fetch(request)
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x1e16b50>
|
||||
...
|
||||
>>> request = request.replace(method="POST")
|
||||
|
||||
>>> fetch(request)
|
||||
|
||||
>>> response.status
|
||||
404
|
||||
|
||||
>>> from pprint import pprint
|
||||
|
||||
>>> pprint(response.headers)
|
||||
{'Accept-Ranges': ['bytes'],
|
||||
'Cache-Control': ['max-age=0, must-revalidate'],
|
||||
'Content-Type': ['text/html; charset=UTF-8'],
|
||||
'Date': ['Thu, 08 Dec 2016 16:21:19 GMT'],
|
||||
'Server': ['snooserv'],
|
||||
'Set-Cookie': ['loid=KqNLou0V9SKMX4qb4n; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Sat, 08-Dec-2018 16:21:19 GMT; secure',
|
||||
'loidcreated=2016-12-08T16%3A21%3A19.445Z; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Sat, 08-Dec-2018 16:21:19 GMT; secure',
|
||||
'loid=vi0ZVe4NkxNWdlH7r7; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Sat, 08-Dec-2018 16:21:19 GMT; secure',
|
||||
'loidcreated=2016-12-08T16%3A21%3A19.459Z; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Sat, 08-Dec-2018 16:21:19 GMT; secure'],
|
||||
'Vary': ['accept-encoding'],
|
||||
'Via': ['1.1 varnish'],
|
||||
'X-Cache': ['MISS'],
|
||||
'X-Cache-Hits': ['0'],
|
||||
'X-Content-Type-Options': ['nosniff'],
|
||||
'X-Frame-Options': ['SAMEORIGIN'],
|
||||
'X-Moose': ['majestic'],
|
||||
'X-Served-By': ['cache-cdg8730-CDG'],
|
||||
'X-Timer': ['S1481214079.394283,VS0,VE159'],
|
||||
'X-Ua-Compatible': ['IE=edge'],
|
||||
'X-Xss-Protection': ['1; mode=block']}
|
||||
|
||||
>>>
|
||||
|
||||
.. _topics-shell-inspect-response:
|
||||
|
||||
@ -186,14 +258,14 @@ Here's an example of how you would call it from your spider::
|
||||
# We want to inspect one specific response.
|
||||
if ".org" in response.url:
|
||||
from scrapy.shell import inspect_response
|
||||
inspect_response(response)
|
||||
inspect_response(response, self)
|
||||
|
||||
# Rest of parsing code.
|
||||
|
||||
When you run the spider, you will get something similar to this::
|
||||
|
||||
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x1e16b50>
|
||||
...
|
||||
@ -201,22 +273,22 @@ When you run the spider, you will get something similar to this::
|
||||
>>> response.url
|
||||
'http://example.org'
|
||||
|
||||
Then, you can check if the extraction code is working::
|
||||
Then, you can check if the extraction code is working:
|
||||
|
||||
>>> sel.xpath('//h1[@class="fn"]')
|
||||
[]
|
||||
>>> response.xpath('//h1[@class="fn"]')
|
||||
[]
|
||||
|
||||
Nope, it doesn't. So you can open the response in your web browser and see if
|
||||
it's the response you were expecting::
|
||||
it's the response you were expecting:
|
||||
|
||||
>>> view(response)
|
||||
True
|
||||
>>> view(response)
|
||||
True
|
||||
|
||||
Finally you hit Ctrl-D (or Ctrl-Z in Windows) to exit the shell and resume the
|
||||
crawling::
|
||||
|
||||
>>> ^D
|
||||
2014-01-23 17:50:03-0400 [myspider] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
|
||||
2014-01-23 17:50:03-0400 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
|
||||
...
|
||||
|
||||
Note that you can't use the ``fetch`` shortcut here since the Scrapy engine is
|
||||
|
@ -16,13 +16,80 @@ deliver the arguments that the handler receives.
|
||||
You can connect to signals (or send your own) through the
|
||||
:ref:`topics-api-signals`.
|
||||
|
||||
Here is a simple example showing how you can catch signals and perform some action::
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy import Spider
|
||||
|
||||
|
||||
class DmozSpider(Spider):
|
||||
name = "dmoz"
|
||||
allowed_domains = ["dmoz.org"]
|
||||
start_urls = [
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
|
||||
]
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
spider = super(DmozSpider, cls).from_crawler(crawler, *args, **kwargs)
|
||||
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
|
||||
return spider
|
||||
|
||||
|
||||
def spider_closed(self, spider):
|
||||
spider.logger.info('Spider closed: %s', spider.name)
|
||||
|
||||
|
||||
def parse(self, response):
|
||||
pass
|
||||
|
||||
.. _signal-deferred:
|
||||
|
||||
Deferred signal handlers
|
||||
========================
|
||||
|
||||
Some signals support returning `Twisted deferreds`_ from their handlers, see
|
||||
the :ref:`topics-signals-ref` below to know which ones.
|
||||
Some signals support returning :class:`~twisted.internet.defer.Deferred`
|
||||
objects from their handlers, allowing you to run asynchronous code that
|
||||
does not block Scrapy. If a signal handler returns a
|
||||
:class:`~twisted.internet.defer.Deferred`, Scrapy waits for that
|
||||
:class:`~twisted.internet.defer.Deferred` to fire.
|
||||
|
||||
.. _Twisted deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html
|
||||
Let's take an example::
|
||||
|
||||
import json

import scrapy
import treq
from scrapy import signals


class SignalSpider(scrapy.Spider):
|
||||
name = 'signals'
|
||||
start_urls = ['http://quotes.toscrape.com/page/1/']
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
|
||||
crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
|
||||
return spider
|
||||
|
||||
def item_scraped(self, item):
|
||||
# Send the scraped item to the server
|
||||
d = treq.post(
|
||||
'http://example.com/post',
|
||||
json.dumps(item).encode('ascii'),
|
||||
headers={b'Content-Type': [b'application/json']}
|
||||
)
|
||||
|
||||
# The next item will be scraped only after
|
||||
# deferred (d) is fired
|
||||
return d
|
||||
|
||||
def parse(self, response):
|
||||
for quote in response.css('div.quote'):
|
||||
yield {
|
||||
'text': quote.css('span.text::text').get(),
|
||||
'author': quote.css('small.author::text').get(),
|
||||
'tags': quote.css('div.tags a.tag::text').getall(),
|
||||
}
|
||||
|
||||
See the :ref:`topics-signals-ref` below to know which signals support
|
||||
:class:`~twisted.internet.defer.Deferred`.
|
||||
|
||||
.. _topics-signals-ref:
|
||||
|
||||
@ -34,22 +101,25 @@ Built-in signals reference
|
||||
|
||||
Here's the list of Scrapy built-in signals and their meaning.
|
||||
|
||||
engine_started
|
||||
Engine signals
|
||||
--------------
|
||||
|
||||
engine_started
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: engine_started
|
||||
.. function:: engine_started()
|
||||
|
||||
Sent when the Scrapy engine has started crawling.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
.. note:: This signal may be fired *after* the :signal:`spider_opened` signal,
|
||||
depending on how the spider was started. So **don't** rely on this signal
|
||||
getting fired before :signal:`spider_opened`.
|
||||
|
||||
engine_stopped
|
||||
--------------
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: engine_stopped
|
||||
.. function:: engine_stopped()
|
||||
@ -57,10 +127,21 @@ engine_stopped
|
||||
Sent when the Scrapy engine is stopped (for example, when a crawling
|
||||
process has finished).
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
Item signals
|
||||
------------
|
||||
|
||||
.. note::
|
||||
Since at most :setting:`CONCURRENT_ITEMS` items are processed in
parallel, many deferreds are fired together using
:class:`~twisted.internet.defer.DeferredList`. Hence the next
batch waits for the :class:`~twisted.internet.defer.DeferredList`
to fire, and only then runs the respective item signal handlers for
the next batch of scraped items.
|
||||
|
||||
item_scraped
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: item_scraped
|
||||
.. function:: item_scraped(item, response, spider)
|
||||
@ -68,19 +149,19 @@ item_scraped
|
||||
Sent when an item has been scraped, after it has passed all the
|
||||
:ref:`topics-item-pipeline` stages (without being dropped).
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item scraped
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
:param item: the scraped item
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
:param response: the response from where the item was scraped
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
item_dropped
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: item_dropped
|
||||
.. function:: item_dropped(item, response, exception, spider)
|
||||
@ -88,13 +169,13 @@ item_dropped
|
||||
Sent after an item has been dropped from the :ref:`topics-item-pipeline`
|
||||
when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
:param response: the response from where the item was dropped
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
@ -104,8 +185,34 @@ item_dropped
|
||||
to be dropped
|
||||
:type exception: :exc:`~scrapy.exceptions.DropItem` exception
|
||||
|
||||
item_error
|
||||
~~~~~~~~~~
|
||||
|
||||
.. signal:: item_error
|
||||
.. function:: item_error(item, response, spider, failure)
|
||||
|
||||
Sent when a :ref:`topics-item-pipeline` stage generates an error (i.e. raises
an exception), except for the :exc:`~scrapy.exceptions.DropItem` exception.
|
||||
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item that caused the error in the :ref:`topics-item-pipeline`
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param response: the response being processed when the exception was raised
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param spider: the spider which raised the exception
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
:param failure: the exception raised
|
||||
:type failure: twisted.python.failure.Failure
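
As a minimal sketch (the extension name is hypothetical, and it would be
enabled through the :setting:`EXTENSIONS` setting), you could log pipeline
failures by connecting to this signal::

    from scrapy import signals

    class ItemErrorLogger:

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.handle_item_error, signal=signals.item_error)
            return ext

        def handle_item_error(self, item, response, spider, failure):
            # failure is a twisted.python.failure.Failure wrapping the exception
            spider.logger.error('Item pipeline error: %s', failure.value)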
|
||||
|
||||
Spider signals
|
||||
--------------
|
||||
|
||||
spider_closed
|
||||
-------------
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_closed
|
||||
.. function:: spider_closed(spider, reason)
|
||||
@ -113,10 +220,10 @@ spider_closed
|
||||
Sent after a spider has been closed. This can be used to release per-spider
|
||||
resources reserved on :signal:`spider_opened`.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has been closed
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
:param reason: a string which describes the reason why the spider was closed. If
|
||||
it was closed because the spider has completed scraping, the reason
|
||||
@ -128,7 +235,7 @@ spider_closed
|
||||
:type reason: str
|
||||
|
||||
spider_opened
|
||||
-------------
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_opened
|
||||
.. function:: spider_opened(spider)
|
||||
@ -137,13 +244,13 @@ spider_opened
|
||||
reserve per-spider resources, but can be used for any task that needs to be
|
||||
performed when a spider is opened.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has been opened
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
spider_idle
|
||||
-----------
|
||||
~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_idle
|
||||
.. function:: spider_idle(spider)
|
||||
@ -158,33 +265,44 @@ spider_idle
|
||||
the engine starts closing the spider. After the spider has finished
|
||||
closing, the :signal:`spider_closed` signal is sent.
|
||||
|
||||
You can, for example, schedule some requests in your :signal:`spider_idle`
|
||||
handler to prevent the spider from being closed.
|
||||
You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to
|
||||
prevent the spider from being closed.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has gone idle
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. note:: Scheduling some requests in your :signal:`spider_idle` handler does
|
||||
**not** guarantee that it can prevent the spider from being closed,
|
||||
although it sometimes can. That's because the spider may still remain idle
|
||||
if all the scheduled requests are rejected by the scheduler (e.g. filtered
|
||||
due to duplication).
|
||||
|
||||
spider_error
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_error
|
||||
.. function:: spider_error(failure, response, spider)
|
||||
|
||||
Sent when a spider callback generates an error (ie. raises an exception).
|
||||
Sent when a spider callback generates an error (i.e. raises an exception).
|
||||
|
||||
:param failure: the exception raised as a Twisted `Failure`_ object
|
||||
:type failure: `Failure`_ object
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param failure: the exception raised
|
||||
:type failure: twisted.python.failure.Failure
|
||||
|
||||
:param response: the response being processed when the exception was raised
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param spider: the spider which raised the exception
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
Request signals
|
||||
---------------
|
||||
|
||||
request_scheduled
|
||||
-----------------
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_scheduled
|
||||
.. function:: request_scheduled(request, spider)
|
||||
@ -192,16 +310,101 @@ request_scheduled
|
||||
Sent when the engine schedules a :class:`~scrapy.http.Request`, to be
|
||||
downloaded later.
|
||||
|
||||
The signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the scheduler
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider that yielded the request
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_dropped
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_dropped
|
||||
.. function:: request_dropped(request, spider)
|
||||
|
||||
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
|
||||
downloaded later, is rejected by the scheduler.
|
||||
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the scheduler
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider that yielded the request
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_reached_downloader
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_reached_downloader
|
||||
.. function:: request_reached_downloader(request, spider)
|
||||
|
||||
Sent when a :class:`~scrapy.http.Request` reaches the downloader.
|
||||
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the downloader
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider that yielded the request
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_left_downloader
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_left_downloader
|
||||
.. function:: request_left_downloader(request, spider)
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
Sent when a :class:`~scrapy.http.Request` leaves the downloader, even in case of
|
||||
failure.
|
||||
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the downloader
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider that yielded the request
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
bytes_received
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
.. signal:: bytes_received
|
||||
.. function:: bytes_received(data, request, spider)
|
||||
|
||||
Sent by the HTTP 1.1 and S3 download handlers when a group of bytes is
|
||||
received for a specific request. This signal might be fired multiple
|
||||
times for the same request, with partial data each time. For instance,
|
||||
a possible scenario for a 25 kb response would be two signals fired
|
||||
with 10 kb of data, and a final one with 5 kb of data.
|
||||
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param data: the data received by the download handler
|
||||
:type data: :class:`bytes` object
|
||||
|
||||
:param request: the request that generated the download
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider associated with the response
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. note:: Handlers of this signal can stop the download of a response while it
|
||||
is in progress by raising the :exc:`~scrapy.exceptions.StopDownload`
|
||||
exception. Please refer to the :ref:`topics-stop-response-download` topic
|
||||
for additional information and examples.
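
For instance, a minimal sketch (the extension name is hypothetical) of a
handler that stops a download as soon as the first chunk of data arrives,
keeping whatever was received::

    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class StopDownloadEarly:

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.on_bytes_received, signal=signals.bytes_received)
            return ext

        def on_bytes_received(self, data, request, spider):
            # fail=False hands the partial response to the request callback
            raise StopDownload(fail=False)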
|
||||
|
||||
Response signals
|
||||
----------------
|
||||
|
||||
response_received
|
||||
-----------------
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: response_received
|
||||
.. function:: response_received(response, request, spider)
|
||||
@ -209,7 +412,7 @@ response_received
|
||||
Sent when the engine receives a new :class:`~scrapy.http.Response` from the
|
||||
downloader.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param response: the response received
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
@ -218,17 +421,22 @@ response_received
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider for which the response is intended
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. note:: The ``request`` argument might not contain the original request that
|
||||
reached the downloader, if a :ref:`topics-downloader-middleware` modifies
|
||||
the :class:`~scrapy.http.Response` object and sets a specific ``request``
|
||||
attribute.
|
||||
|
||||
response_downloaded
|
||||
-------------------
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: response_downloaded
|
||||
.. function:: response_downloaded(response, request, spider)
|
||||
|
||||
Sent by the downloader right after a ``HTTPResponse`` is downloaded.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param response: the response downloaded
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
@ -237,6 +445,4 @@ response_downloaded
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider for which the response is intended
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
|
||||
.. _Failure: http://twistedmatrix.com/documents/current/api/twisted.python.failure.Failure.html
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
@ -28,7 +28,12 @@ The :setting:`SPIDER_MIDDLEWARES` setting is merged with the
|
||||
:setting:`SPIDER_MIDDLEWARES_BASE` setting defined in Scrapy (and not meant to
|
||||
be overridden) and then sorted by order to get the final sorted list of enabled
|
||||
middlewares: the first middleware is the one closer to the engine and the last
|
||||
is the one closer to the spider.
|
||||
is the one closer to the spider. In other words,
|
||||
the :meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_input`
|
||||
method of each middleware will be invoked in increasing
|
||||
middleware order (100, 200, 300, ...), and the
|
||||
:meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_output` method
|
||||
of each middleware will be invoked in decreasing order.
|
||||
|
||||
To decide which order to assign to your middleware see the
|
||||
:setting:`SPIDER_MIDDLEWARES_BASE` setting and pick a value according to where
|
||||
@ -38,24 +43,30 @@ previous (or subsequent) middleware being applied.
|
||||
|
||||
If you want to disable a builtin middleware (the ones defined in
|
||||
:setting:`SPIDER_MIDDLEWARES_BASE`, and enabled by default) you must define it
|
||||
in your project :setting:`SPIDER_MIDDLEWARES` setting and assign `None` as its
|
||||
in your project :setting:`SPIDER_MIDDLEWARES` setting and assign ``None`` as its
|
||||
value. For example, if you want to disable the off-site middleware::
|
||||
|
||||
SPIDER_MIDDLEWARES = {
|
||||
'myproject.middlewares.CustomSpiderMiddleware': 543,
|
||||
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None,
|
||||
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
|
||||
}
|
||||
|
||||
Finally, keep in mind that some middlewares may need to be enabled through a
|
||||
particular setting. See each middleware documentation for more info.
|
||||
|
||||
.. _custom-spider-middleware:
|
||||
|
||||
Writing your own spider middleware
|
||||
==================================
|
||||
|
||||
Writing your own spider middleware is easy. Each middleware component is a
|
||||
single Python class that defines one or more of the following methods:
|
||||
Each spider middleware is a Python class that defines one or more of the
|
||||
methods defined below.
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware
|
||||
The main entry point is the ``from_crawler`` class method, which receives a
|
||||
:class:`~scrapy.crawler.Crawler` instance. The :class:`~scrapy.crawler.Crawler`
|
||||
object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
|
||||
.. module:: scrapy.spidermiddlewares
|
||||
|
||||
.. class:: SpiderMiddleware
|
||||
|
||||
@ -73,7 +84,8 @@ single Python class that defines one or more of the following methods:
|
||||
|
||||
If it raises an exception, Scrapy won't bother calling any other spider
|
||||
middleware :meth:`process_spider_input` and will call the request
|
||||
errback. The output of the errback is chained back in the other
|
||||
errback if there is one, otherwise it will start the :meth:`process_spider_exception`
|
||||
chain. The output of the errback is chained back in the other
|
||||
direction for :meth:`process_spider_output` to process it, or
|
||||
:meth:`process_spider_exception` if it raised an exception.
|
||||
|
||||
@ -81,7 +93,7 @@ single Python class that defines one or more of the following methods:
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param spider: the spider for which this response is intended
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
|
||||
.. method:: process_spider_output(response, result, spider)
|
||||
@ -90,28 +102,28 @@ single Python class that defines one or more of the following methods:
|
||||
it has processed the response.
|
||||
|
||||
:meth:`process_spider_output` must return an iterable of
|
||||
:class:`~scrapy.http.Request` or :class:`~scrapy.item.Item` objects.
|
||||
:class:`~scrapy.http.Request` objects and :ref:`item object
|
||||
<topics-items>`.
|
||||
|
||||
:param response: the response which generated this output from the
|
||||
spider
|
||||
:type response: class:`~scrapy.http.Response` object
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param result: the result returned by the spider
|
||||
:type result: an iterable of :class:`~scrapy.http.Request` or
|
||||
:class:`~scrapy.item.Item` objects
|
||||
:type result: an iterable of :class:`~scrapy.http.Request` objects and
|
||||
:ref:`item object <topics-items>`
|
||||
|
||||
:param spider: the spider whose result is being processed
|
||||
:type spider: :class:`~scrapy.item.Spider` object
|
||||
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. method:: process_spider_exception(response, exception, spider)
|
||||
|
||||
This method is called when when a spider or :meth:`process_spider_input`
|
||||
method (from other spider middleware) raises an exception.
|
||||
This method is called when a spider or :meth:`process_spider_output`
|
||||
method (from a previous spider middleware) raises an exception.
|
||||
|
||||
:meth:`process_spider_exception` should return either ``None`` or an
|
||||
iterable of :class:`~scrapy.http.Response` or
|
||||
:class:`~scrapy.item.Item` objects.
|
||||
iterable of :class:`~scrapy.http.Request` objects and :ref:`item object
|
||||
<topics-items>`.
|
||||
|
||||
If it returns ``None``, Scrapy will continue processing this exception,
|
||||
executing any other :meth:`process_spider_exception` in the following
|
||||
@ -119,17 +131,18 @@ single Python class that defines one or more of the following methods:
|
||||
exception reaches the engine (where it's logged and discarded).
|
||||
|
||||
If it returns an iterable the :meth:`process_spider_output` pipeline
|
||||
kicks in, and no other :meth:`process_spider_exception` will be called.
|
||||
kicks in, starting from the next spider middleware, and no other
|
||||
:meth:`process_spider_exception` will be called.
|
||||
|
||||
:param response: the response being processed when the exception was
|
||||
raised
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param exception: the exception raised
|
||||
:type exception: `Exception`_ object
|
||||
:type exception: :exc:`Exception` object
|
||||
|
||||
:param spider: the spider which raised the exception
|
||||
:type spider: :class:`~scrapy.spider.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. method:: process_start_requests(start_requests, spider)
|
||||
|
||||
@ -156,11 +169,18 @@ single Python class that defines one or more of the following methods:
|
||||
:type start_requests: an iterable of :class:`~scrapy.http.Request`
|
||||
|
||||
:param spider: the spider to whom the start requests belong
|
||||
:type spider: :class:`~scrapy.item.Spider` object
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. method:: from_crawler(cls, crawler)
|
||||
|
||||
.. _Exception: http://docs.python.org/library/exceptions.html#exceptions.Exception
|
||||
If present, this classmethod is called to create a middleware instance
|
||||
from a :class:`~scrapy.crawler.Crawler`. It must return a new instance
|
||||
of the middleware. The Crawler object provides access to all Scrapy core
|
||||
components like settings and signals; it is a way for middleware to
|
||||
access them and hook its functionality into Scrapy.
|
||||
|
||||
:param crawler: crawler that uses this middleware
|
||||
:type crawler: :class:`~scrapy.crawler.Crawler` object
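
For example, a minimal sketch (the setting name is hypothetical) of a spider
middleware that reads a setting in ``from_crawler`` and tags outgoing
requests::

    import scrapy

    class TagRequestsMiddleware:

        def __init__(self, tag):
            self.tag = tag

        @classmethod
        def from_crawler(cls, crawler):
            return cls(tag=crawler.settings.get('MYPROJECT_REQUEST_TAG', 'default'))

        def process_spider_output(self, response, result, spider):
            for element in result:
                if isinstance(element, scrapy.Request):
                    element.meta['request_tag'] = self.tag
                yield element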
|
||||
|
||||
.. _topics-spider-middleware-ref:
|
||||
|
||||
@ -177,28 +197,33 @@ For a list of the components enabled by default (and their orders) see the
|
||||
DepthMiddleware
|
||||
---------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.depth
|
||||
.. module:: scrapy.spidermiddlewares.depth
|
||||
:synopsis: Depth Spider Middleware
|
||||
|
||||
.. class:: DepthMiddleware
|
||||
|
||||
DepthMiddleware is a scrape middleware used for tracking the depth of each
|
||||
Request inside the site being scraped. It can be used to limit the maximum
|
||||
depth to scrape or things like that.
|
||||
DepthMiddleware is used for tracking the depth of each Request inside the
|
||||
site being scraped. It works by setting ``request.meta['depth'] = 0`` whenever
|
||||
there is no value previously set (usually just the first Request) and
|
||||
incrementing it by 1 otherwise.
|
||||
|
||||
It can be used to limit the maximum depth to scrape, control Request
|
||||
priority based on their depth, and things like that.
|
||||
|
||||
The :class:`DepthMiddleware` can be configured through the following
|
||||
settings (see the settings documentation for more info):
|
||||
|
||||
* :setting:`DEPTH_LIMIT` - The maximum depth that will be allowed to
|
||||
crawl for any site. If zero, no limit will be imposed.
|
||||
* :setting:`DEPTH_STATS` - Whether to collect depth stats.
|
||||
* :setting:`DEPTH_STATS_VERBOSE` - Whether to collect the number of
|
||||
requests for each depth.
|
||||
* :setting:`DEPTH_PRIORITY` - Whether to prioritize the requests based on
|
||||
their depth.
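
For example, a minimal sketch of these settings in ``settings.py``::

    DEPTH_LIMIT = 3             # do not crawl deeper than 3 links from the start URLs
    DEPTH_STATS_VERBOSE = True  # collect per-depth request counts
    DEPTH_PRIORITY = 1          # positive values lower the priority of deeper requests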
|
||||
|
||||
HttpErrorMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.httperror
|
||||
.. module:: scrapy.spidermiddlewares.httperror
|
||||
:synopsis: HTTP Error Spider Middleware
|
||||
|
||||
.. class:: HttpErrorMiddleware
|
||||
@ -210,7 +235,7 @@ HttpErrorMiddleware
|
||||
According to the `HTTP standard`_, successful responses are those whose
|
||||
status codes are in the 200-300 range.
|
||||
|
||||
.. _HTTP standard: http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||
.. _HTTP standard: https://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||
|
||||
If you still want to process response codes outside that range, you can
|
||||
specify which response codes the spider is able to handle using the
|
||||
@ -225,16 +250,19 @@ this::
|
||||
|
||||
.. reqmeta:: handle_httpstatus_list
|
||||
|
||||
.. reqmeta:: handle_httpstatus_all
|
||||
|
||||
The ``handle_httpstatus_list`` key of :attr:`Request.meta
|
||||
<scrapy.http.Request.meta>` can also be used to specify which response codes to
|
||||
allow on a per-request basis.
|
||||
allow on a per-request basis. You can also set the meta key ``handle_httpstatus_all``
|
||||
to ``True`` if you want to allow any response code for a request.
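
For example, a sketch of allowing a 404 response for a single request (the URL
and callback are placeholders)::

    yield scrapy.Request(
        'http://www.example.com/may-not-exist',
        callback=self.parse_maybe_missing,
        meta={'handle_httpstatus_list': [404]},
    )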
|
||||
|
||||
Keep in mind, however, that it's usually a bad idea to handle non-200
|
||||
responses, unless you really know what you're doing.
|
||||
|
||||
For more information see: `HTTP Status Code Definitions`_.
|
||||
|
||||
.. _HTTP Status Code Definitions: http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||
.. _HTTP Status Code Definitions: https://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
|
||||
|
||||
HttpErrorMiddleware settings
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -260,7 +288,7 @@ Pass all responses, regardless of its status code.
|
||||
OffsiteMiddleware
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.offsite
|
||||
.. module:: scrapy.spidermiddlewares.offsite
|
||||
:synopsis: Offsite Spider Middleware
|
||||
|
||||
.. class:: OffsiteMiddleware
|
||||
@ -268,7 +296,10 @@ OffsiteMiddleware
|
||||
Filters out Requests for URLs outside the domains covered by the spider.
|
||||
|
||||
This middleware filters out every request whose host names aren't in the
|
||||
spider's :attr:`~scrapy.spider.Spider.allowed_domains` attribute.
|
||||
spider's :attr:`~scrapy.spiders.Spider.allowed_domains` attribute.
|
||||
All subdomains of any domain in the list are also allowed.
|
||||
E.g. the rule ``www.example.org`` will also allow ``bob.www.example.org``
|
||||
but not ``www2.example.com`` nor ``example.com``.
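
For example, a minimal sketch of a spider restricted to a single domain::

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'
        allowed_domains = ['example.org']
        start_urls = ['https://www.example.org/']

        def parse(self, response):
            # allowed: a subdomain of example.org
            yield response.follow('https://bob.www.example.org/page', callback=self.parse)
            # filtered out by OffsiteMiddleware: a different domain
            yield response.follow('https://www.example.com/page', callback=self.parse)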
|
||||

When your spider returns a request for a domain not belonging to those
covered by the spider, this middleware will log a debug message similar to
@ -283,7 +314,7 @@ OffsiteMiddleware
will be printed (but only for the first request filtered).

If the spider doesn't define an
:attr:`~scrapy.spider.Spider.allowed_domains` attribute, or the
:attr:`~scrapy.spiders.Spider.allowed_domains` attribute, or the
attribute is empty, the offsite middleware will allow all requests.

If the request has the :attr:`~scrapy.http.Request.dont_filter` attribute
@ -294,7 +325,7 @@ OffsiteMiddleware
RefererMiddleware
-----------------

.. module:: scrapy.contrib.spidermiddleware.referer
.. module:: scrapy.spidermiddlewares.referer
    :synopsis: Referer Spider Middleware

.. class:: RefererMiddleware
@ -316,10 +347,94 @@ Default: ``True``

Whether to enable referer middleware.

.. setting:: REFERRER_POLICY

REFERRER_POLICY
^^^^^^^^^^^^^^^

.. versionadded:: 1.4

Default: ``'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'``

.. reqmeta:: referrer_policy

`Referrer Policy`_ to apply when populating Request "Referer" header.

.. note::
    You can also set the Referrer Policy per request,
    using the special ``"referrer_policy"`` :ref:`Request.meta <topics-request-meta>` key,
    with the same acceptable values as for the ``REFERRER_POLICY`` setting.

Acceptable values for REFERRER_POLICY
*************************************

- either a path to a ``scrapy.spidermiddlewares.referer.ReferrerPolicy``
  subclass — a custom policy or one of the built-in ones (see classes below),
- or one of the standard W3C-defined string values,
- or the special ``"scrapy-default"``.

======================================= ========================================================================
String value                            Class name (as a string)
======================================= ========================================================================
``"scrapy-default"`` (default)          :class:`scrapy.spidermiddlewares.referer.DefaultReferrerPolicy`
`"no-referrer"`_                        :class:`scrapy.spidermiddlewares.referer.NoReferrerPolicy`
`"no-referrer-when-downgrade"`_         :class:`scrapy.spidermiddlewares.referer.NoReferrerWhenDowngradePolicy`
`"same-origin"`_                        :class:`scrapy.spidermiddlewares.referer.SameOriginPolicy`
`"origin"`_                             :class:`scrapy.spidermiddlewares.referer.OriginPolicy`
`"strict-origin"`_                      :class:`scrapy.spidermiddlewares.referer.StrictOriginPolicy`
`"origin-when-cross-origin"`_           :class:`scrapy.spidermiddlewares.referer.OriginWhenCrossOriginPolicy`
`"strict-origin-when-cross-origin"`_    :class:`scrapy.spidermiddlewares.referer.StrictOriginWhenCrossOriginPolicy`
`"unsafe-url"`_                         :class:`scrapy.spidermiddlewares.referer.UnsafeUrlPolicy`
======================================= ========================================================================
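
A minimal sketch of both ways to select a policy described above (the spider and
URLs are hypothetical): project-wide through the setting, or per request through
the ``"referrer_policy"`` meta key::

    # In settings.py: either a W3C string value, "scrapy-default",
    # or a path to a ReferrerPolicy subclass.
    # REFERRER_POLICY = 'same-origin'

    import scrapy

    class ReferrerExampleSpider(scrapy.Spider):
        name = 'referrer_example'

        def start_requests(self):
            # Override the project-wide policy for this request only.
            yield scrapy.Request(
                'https://www.example.com/',
                meta={'referrer_policy': 'no-referrer'},
            )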

.. autoclass:: DefaultReferrerPolicy
.. warning::
    Scrapy's default referrer policy — just like `"no-referrer-when-downgrade"`_,
    the W3C-recommended value for browsers — will send a non-empty
    "Referer" header from any ``http(s)://`` to any ``https://`` URL,
    even if the domain is different.

    `"same-origin"`_ may be a better choice if you want to remove referrer
    information for cross-domain requests.

.. autoclass:: NoReferrerPolicy

.. autoclass:: NoReferrerWhenDowngradePolicy
.. note::
    "no-referrer-when-downgrade" policy is the W3C-recommended default,
    and is used by major web browsers.

    However, it is NOT Scrapy's default referrer policy (see :class:`DefaultReferrerPolicy`).

.. autoclass:: SameOriginPolicy

.. autoclass:: OriginPolicy

.. autoclass:: StrictOriginPolicy

.. autoclass:: OriginWhenCrossOriginPolicy

.. autoclass:: StrictOriginWhenCrossOriginPolicy

.. autoclass:: UnsafeUrlPolicy
.. warning::
    "unsafe-url" policy is NOT recommended.

.. _Referrer Policy: https://www.w3.org/TR/referrer-policy
.. _"no-referrer": https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
.. _"no-referrer-when-downgrade": https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade
.. _"same-origin": https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin
.. _"origin": https://www.w3.org/TR/referrer-policy/#referrer-policy-origin
.. _"strict-origin": https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin
.. _"origin-when-cross-origin": https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin
.. _"strict-origin-when-cross-origin": https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin
.. _"unsafe-url": https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url


UrlLengthMiddleware
-------------------

.. module:: scrapy.contrib.spidermiddleware.urllength
.. module:: scrapy.spidermiddlewares.urllength
    :synopsis: URL Length Spider Middleware

.. class:: UrlLengthMiddleware

@ -17,15 +17,16 @@ For spiders, the scraping cycle goes through something like this:
   those requests.

   The first requests to perform are obtained by calling the
   :meth:`~scrapy.spider.Spider.start_requests` method which (by default)
   :meth:`~scrapy.spiders.Spider.start_requests` method which (by default)
   generates :class:`~scrapy.http.Request` for the URLs specified in the
   :attr:`~scrapy.spider.Spider.start_urls` and the
   :attr:`~scrapy.spider.Spider.parse` method as callback function for the
   :attr:`~scrapy.spiders.Spider.start_urls` and the
   :attr:`~scrapy.spiders.Spider.parse` method as callback function for the
   Requests.

2. In the callback function, you parse the response (web page) and return either
   :class:`~scrapy.item.Item` objects, :class:`~scrapy.http.Request` objects,
   or an iterable of both. Those Requests will also contain a callback (maybe
2. In the callback function, you parse the response (web page) and return
   :ref:`item objects <topics-items>`,
   :class:`~scrapy.http.Request` objects, or an iterable of these objects.
   Those Requests will also contain a callback (maybe
   the same) and will then be downloaded by Scrapy and then their
   response handled by the specified callback.

@ -41,70 +42,22 @@ Even though this cycle applies (more or less) to any kind of spider, there are
different kinds of default spiders bundled into Scrapy for different purposes.
We will talk about those types here.

.. _spiderargs:

Spider arguments
================

Spiders can receive arguments that modify their behaviour. Some common uses for
spider arguments are to define the start URLs or to restrict the crawl to
certain sections of the site, but they can be used to configure any
functionality of the spider.

Spider arguments are passed through the :command:`crawl` command using the
``-a`` option. For example::

    scrapy crawl myspider -a category=electronics

Spiders receive arguments in their constructors::

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def __init__(self, category=None, *args, **kwargs):
            super(MySpider, self).__init__(*args, **kwargs)
            self.start_urls = ['http://www.example.com/categories/%s' % category]
            # ...

Spider arguments can also be passed through the Scrapyd ``schedule.json`` API.
See `Scrapyd documentation`_.
.. module:: scrapy.spiders
    :synopsis: Spiders base class, spider manager and spider middleware

.. _topics-spiders-ref:

Built-in spiders reference
==========================

Scrapy comes with some useful generic spiders that you can use, to subclass
your spiders from. Their aim is to provide convenient functionality for a few
common scraping cases, like following all links on a site based on certain
rules, crawling from `Sitemaps`_, or parsing a XML/CSV feed.

For the examples used in the following spiders, we'll assume you have a project
with a ``TestItem`` declared in a ``myproject.items`` module::

    import scrapy

    class TestItem(scrapy.Item):
        id = scrapy.Field()
        name = scrapy.Field()
        description = scrapy.Field()


.. module:: scrapy.spider
    :synopsis: Spiders base class, spider manager and spider middleware

Spider
------
scrapy.Spider
=============

.. class:: Spider()

This is the simplest spider, and the one from which every other spider
must inherit from (either the ones that come bundled with Scrapy, or the ones
must inherit (including spiders that come bundled with Scrapy, as well as spiders
that you write yourself). It doesn't provide any special functionality. It just
requests the given ``start_urls``/``start_requests``, and calls the spider's
method ``parse`` for each of the resulting responses.
provides a default :meth:`start_requests` implementation which sends requests from
the :attr:`start_urls` spider attribute and calls the spider's method ``parse``
for each of the resulting responses.

.. attribute:: name

@ -123,55 +76,98 @@ Spider

An optional list of strings containing domains that this spider is
allowed to crawl. Requests for URLs not belonging to the domain names
specified in this list won't be followed if
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled.
specified in this list (or their subdomains) won't be followed if
:class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware` is enabled.

Let's say your target url is ``https://www.example.com/1.html``,
then add ``'example.com'`` to the list.

.. attribute:: start_urls

A list of URLs where the spider will begin to crawl from, when no
particular URLs are specified. So, the first pages downloaded will be those
listed here. The subsequent URLs will be generated successively from data
listed here. The subsequent :class:`~scrapy.http.Request` will be generated successively from data
contained in the start URLs.

.. attribute:: custom_settings

A dictionary of settings that will be overridden from the project wide
configuration when running this spider. It must be defined as a class
attribute since the settings are updated before instantiation.

For a list of available built-in settings see:
:ref:`topics-settings-ref`.
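
For instance, a hypothetical spider could slow itself down without touching the
project-wide configuration; a minimal sketch::

    import scrapy

    class PoliteSpider(scrapy.Spider):
        name = 'polite'
        custom_settings = {
            'DOWNLOAD_DELAY': 2,
            'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
        }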

.. attribute:: crawler

This attribute is set by the :meth:`from_crawler` class method after
initializing the class, and links to the
:class:`~scrapy.crawler.Crawler` object to which this spider instance is
bound.

Crawlers encapsulate a lot of components in the project for their single
entry access (such as extensions, middlewares, signals managers, etc).
See :ref:`topics-api-crawler` to know more about them.

.. attribute:: settings

Configuration for running this spider. This is a
:class:`~scrapy.settings.Settings` instance, see the
:ref:`topics-settings` topic for a detailed introduction on this subject.

.. attribute:: logger

Python logger created with the Spider's :attr:`name`. You can use it to
send log messages through it as described on
:ref:`topics-logging-from-spiders`.

.. method:: from_crawler(crawler, *args, **kwargs)

This is the class method used by Scrapy to create your spiders.

You probably won't need to override this directly because the default
implementation acts as a proxy to the :meth:`__init__` method, calling
it with the given arguments ``args`` and named arguments ``kwargs``.

Nonetheless, this method sets the :attr:`crawler` and :attr:`settings`
attributes in the new instance so they can be accessed later inside the
spider's code.

:param crawler: crawler to which the spider will be bound
:type crawler: :class:`~scrapy.crawler.Crawler` instance

:param args: arguments passed to the :meth:`__init__` method
:type args: list

:param kwargs: keyword arguments passed to the :meth:`__init__` method
:type kwargs: dict

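When you do override it, a common pattern is to extend the default behaviour
rather than replace it; a minimal sketch (the ``MYPROJECT_LIMIT`` setting is
hypothetical)::

    import scrapy

    class LimitedSpider(scrapy.Spider):
        name = 'limited'

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # crawler.settings is already available at this point.
            spider.limit = crawler.settings.getint('MYPROJECT_LIMIT', 10)
            return spider
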
.. method:: start_requests()

This method must return an iterable with the first Requests to crawl for
this spider.
this spider. It is called by Scrapy when the spider is opened for
scraping. Scrapy calls it only once, so it is safe to implement
:meth:`start_requests` as a generator.

This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the :meth:`make_requests_from_url` is used instead to create
the Requests. This method is also called only once from Scrapy, so it's
safe to implement it as a generator.

The default implementation uses :meth:`make_requests_from_url` to
generate Requests for each url in :attr:`start_urls`.
The default implementation generates ``Request(url, dont_filter=True)``
for each url in :attr:`start_urls`.

If you want to change the Requests used to start scraping a domain, this is
the method to override. For example, if you need to start by logging in using
a POST request, you could do::

    def start_requests(self):
        return [scrapy.FormRequest("http://www.example.com/login",
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]
    class MySpider(scrapy.Spider):
        name = 'myspider'

    def logged_in(self, response):
        # here you would extract links to follow and return Requests for
        # each of them, with another callback
        pass
        def start_requests(self):
            return [scrapy.FormRequest("http://www.example.com/login",
                                       formdata={'user': 'john', 'pass': 'secret'},
                                       callback=self.logged_in)]

.. method:: make_requests_from_url(url)

A method that receives a URL and returns a :class:`~scrapy.http.Request`
object (or a list of :class:`~scrapy.http.Request` objects) to scrape. This
method is used to construct the initial requests in the
:meth:`start_requests` method, and is typically used to convert urls to
requests.

Unless overridden, this method returns Requests with the :meth:`parse`
method as their callback function, and with dont_filter parameter enabled
(see :class:`~scrapy.http.Request` class for more info).
        def logged_in(self, response):
            # here you would extract links to follow and return Requests for
            # each of them, with another callback
            pass

.. method:: parse(response)

@ -183,27 +179,23 @@ Spider
the same requirements as the :class:`Spider` class.

This method, as well as any other Request callback, must return an
iterable of :class:`~scrapy.http.Request` and/or
:class:`~scrapy.item.Item` objects.
iterable of :class:`~scrapy.http.Request` and/or :ref:`item objects
<topics-items>`.

:param response: the response to parse
:type response: :class:~scrapy.http.Response`
:type response: :class:`~scrapy.http.Response`

.. method:: log(message, [level, component])

Log a message using the :func:`scrapy.log.msg` function, automatically
populating the spider argument with the :attr:`name` of this
spider. For more information see :ref:`topics-logging`.
Wrapper that sends a log message through the Spider's :attr:`logger`,
kept for backward compatibility. For more information see
:ref:`topics-logging-from-spiders`.

.. method:: closed(reason)

Called when the spider closes. This method provides a shortcut to
signals.connect() for the :signal:`spider_closed` signal.


Spider example
~~~~~~~~~~~~~~

Let's see an example::

    import scrapy
@ -219,12 +211,11 @@ Let's see an example::
        ]

        def parse(self, response):
            self.log('A response from %s just arrived!' % response.url)
            self.logger.info('A response from %s just arrived!', response.url)

Another example returning multiple Requests and Items from a single callback::
Return multiple Requests and items from a single callback::

    import scrapy
    from myproject.items import MyItem

    class MySpider(scrapy.Spider):
        name = 'example.com'
@ -236,14 +227,115 @@ Another example returning multiple Requests and Items from a single callback::
        ]

        def parse(self, response):
            for h3 in response.xpath('//h3').extract():
            for h3 in response.xpath('//h3').getall():
                yield {"title": h3}

            for href in response.xpath('//a/@href').getall():
                yield scrapy.Request(response.urljoin(href), self.parse)

Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly;
to give data more structure you can use :class:`~scrapy.item.Item` objects::

    import scrapy
    from myproject.items import MyItem

    class MySpider(scrapy.Spider):
        name = 'example.com'
        allowed_domains = ['example.com']

        def start_requests(self):
            yield scrapy.Request('http://www.example.com/1.html', self.parse)
            yield scrapy.Request('http://www.example.com/2.html', self.parse)
            yield scrapy.Request('http://www.example.com/3.html', self.parse)

        def parse(self, response):
            for h3 in response.xpath('//h3').getall():
                yield MyItem(title=h3)

            for url in response.xpath('//a/@href').extract():
                yield scrapy.Request(url, callback=self.parse)
            for href in response.xpath('//a/@href').getall():
                yield scrapy.Request(response.urljoin(href), self.parse)

.. module:: scrapy.contrib.spiders
    :synopsis: Collection of generic spiders
.. _spiderargs:

Spider arguments
================

Spiders can receive arguments that modify their behaviour. Some common uses for
spider arguments are to define the start URLs or to restrict the crawl to
certain sections of the site, but they can be used to configure any
functionality of the spider.

Spider arguments are passed through the :command:`crawl` command using the
``-a`` option. For example::

    scrapy crawl myspider -a category=electronics

Spiders can access arguments in their `__init__` methods::

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def __init__(self, category=None, *args, **kwargs):
            super(MySpider, self).__init__(*args, **kwargs)
            self.start_urls = ['http://www.example.com/categories/%s' % category]
            # ...

The default `__init__` method will take any spider arguments
and copy them to the spider as attributes.
The above example can also be written as follows::

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def start_requests(self):
            yield scrapy.Request('http://www.example.com/categories/%s' % self.category)

Keep in mind that spider arguments are only strings.
The spider will not do any parsing on its own.
If you were to set the ``start_urls`` attribute from the command line,
you would have to parse it on your own into a list
using something like :func:`ast.literal_eval` or :func:`json.loads`
and then set it as an attribute.
Otherwise, you would cause iteration over a ``start_urls`` string
(a very common python pitfall)
resulting in each character being seen as a separate url.

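A minimal sketch of that approach (the argument name is hypothetical), passing a
JSON-encoded list on the command line and decoding it in ``__init__``::

    # scrapy crawl myspider -a urls='["http://www.example.com/1.html", "http://www.example.com/2.html"]'
    import json

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def __init__(self, urls=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.start_urls = json.loads(urls) if urls else []
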
A valid use case is to set the http auth credentials
used by :class:`~scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware`
or the user agent
used by :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`::

    scrapy crawl myspider -a http_user=myuser -a http_pass=mypassword -a user_agent=mybot

Spider arguments can also be passed through the Scrapyd ``schedule.json`` API.
See `Scrapyd documentation`_.

.. _builtin-spiders:

Generic Spiders
===============

Scrapy comes with some useful generic spiders that you can use to subclass
your spiders from. Their aim is to provide convenient functionality for a few
common scraping cases, like following all links on a site based on certain
rules, crawling from `Sitemaps`_, or parsing an XML/CSV feed.

For the examples used in the following spiders, we'll assume you have a project
with a ``TestItem`` declared in a ``myproject.items`` module::

    import scrapy

    class TestItem(scrapy.Item):
        id = scrapy.Field()
        name = scrapy.Field()
        description = scrapy.Field()


.. currentmodule:: scrapy.spiders

CrawlSpider
-----------
@ -268,48 +360,67 @@ CrawlSpider

This spider also exposes an overrideable method:

.. method:: parse_start_url(response)
.. method:: parse_start_url(response, **kwargs)

This method is called for the start_urls responses. It allows to parse
the initial responses and must return either a
:class:`~scrapy.item.Item` object, a :class:`~scrapy.http.Request`
This method is called for each response produced for the URLs in
the spider's ``start_urls`` attribute. It allows to parse
the initial responses and must return either an
:ref:`item object <topics-items>`, a :class:`~scrapy.http.Request`
object, or an iterable containing any of them.

Crawling rules
~~~~~~~~~~~~~~

.. class:: Rule(link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None)
.. autoclass:: Rule

``link_extractor`` is a :ref:`Link Extractor <topics-link-extractors>` object which
defines how links will be extracted from each crawled page.
defines how links will be extracted from each crawled page. Each produced link will
be used to generate a :class:`~scrapy.http.Request` object, which will contain the
link's text in its ``meta`` dictionary (under the ``link_text`` key).
If omitted, a default link extractor created with no arguments will be used,
resulting in all links being extracted.

``callback`` is a callable or a string (in which case a method from the spider
object with that name will be used) to be called for each link extracted with
the specified link_extractor. This callback receives a response as its first
argument and must return a list containing :class:`~scrapy.item.Item` and/or
:class:`~scrapy.http.Request` objects (or any subclass of them).

.. warning:: When writing crawl spider rules, avoid using ``parse`` as
    callback, since the :class:`CrawlSpider` uses the ``parse`` method
    itself to implement its logic. So if you override the ``parse`` method,
    the crawl spider will no longer work.
the specified link extractor. This callback receives a :class:`~scrapy.http.Response`
as its first argument and must return either a single instance or an iterable of
:ref:`item objects <topics-items>` and/or :class:`~scrapy.http.Request` objects
(or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response`
object will contain the text of the link that produced the :class:`~scrapy.http.Request`
in its ``meta`` dictionary (under the ``link_text`` key)

``cb_kwargs`` is a dict containing the keyword arguments to be passed to the
callback function.

``follow`` is a boolean which specifies if links should be followed from each
response extracted with this rule. If ``callback`` is None ``follow`` defaults
to ``True``, otherwise it default to ``False``.
to ``True``, otherwise it defaults to ``False``.

``process_links`` is a callable, or a string (in which case a method from the
spider object with that name will be used) which will be called for each list
of links extracted from each response using the specified ``link_extractor``.
This is mainly used for filtering purposes.

``process_request`` is a callable, or a string (in which case a method from
the spider object with that name will be used) which will be called with
every request extracted by this rule, and must return a request or None (to
filter out the request).
``process_request`` is a callable (or a string, in which case a method from
the spider object with that name will be used) which will be called for every
:class:`~scrapy.http.Request` extracted by this rule. This callable should
take said request as first argument and the :class:`~scrapy.http.Response`
from which the request originated as second argument. It must return a
``Request`` object or ``None`` (to filter out the request).

``errback`` is a callable or a string (in which case a method from the spider
object with that name will be used) to be called if any exception is
raised while processing a request generated by the rule.
It receives a :class:`Twisted Failure <twisted.python.failure.Failure>`
instance as first parameter.


.. warning:: Because of its internal implementation, you must explicitly set
    callbacks for new requests when writing :class:`CrawlSpider`-based spiders;
    unexpected behaviour can occur otherwise.

.. versionadded:: 2.0
    The *errback* parameter.
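
A minimal sketch tying these parameters together (the URL pattern and method
names are hypothetical); it tags every request produced by the rule and attaches
an errback::

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class RuleExampleSpider(CrawlSpider):
        name = 'rule_example'
        start_urls = ['http://www.example.com/']

        rules = (
            Rule(
                LinkExtractor(allow=r'/items/'),
                callback='parse_item',
                process_request='tag_request',
                errback='handle_error',
            ),
        )

        def tag_request(self, request, response):
            # Return the (possibly modified) request, or None to drop it.
            request.meta['seed_url'] = response.url
            return request

        def parse_item(self, response):
            yield {'url': response.url, 'link_text': response.meta.get('link_text')}

        def handle_error(self, failure):
            self.logger.error('Request failed: %r', failure)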

CrawlSpider example
~~~~~~~~~~~~~~~~~~~
@ -317,8 +428,8 @@ CrawlSpider example
Let's now take a look at an example CrawlSpider with rules::

    import scrapy
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class MySpider(CrawlSpider):
        name = 'example.com'
@ -335,18 +446,24 @@ Let's now take a look at an example CrawlSpider with rules::
        )

        def parse_item(self, response):
            self.log('Hi, this is an item page! %s' % response.url)
            self.logger.info('Hi, this is an item page! %s', response.url)
            item = scrapy.Item()
            item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
            item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
            item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
            item['name'] = response.xpath('//td[@id="item_name"]/text()').get()
            item['description'] = response.xpath('//td[@id="item_description"]/text()').get()
            item['link_text'] = response.meta['link_text']
            url = response.xpath('//td[@id="additional_data"]/@href').get()
            return response.follow(url, self.parse_additional_page, cb_kwargs=dict(item=item))

        def parse_additional_page(self, response, item):
            item['additional_data'] = response.xpath('//p[@id="additional_data"]/text()').get()
            return item


This spider would start crawling example.com's home page, collecting category
links, and item links, parsing the latter with the ``parse_item`` method. For
each item response, some data will be extracted from the HTML using XPath, and
a :class:`~scrapy.item.Item` will be filled with it.
an :class:`~scrapy.item.Item` will be filled with it.

XMLFeedSpider
-------------
@ -420,7 +537,7 @@ XMLFeedSpider
(``itertag``). Receives the response and an
:class:`~scrapy.selector.Selector` for each node. Overriding this
method is mandatory. Otherwise, your spider won't work. This method
must return either a :class:`~scrapy.item.Item` object, a
must return an :ref:`item object <topics-items>`, a
:class:`~scrapy.http.Request` object, or an iterable containing any of
them.

@ -430,7 +547,12 @@ XMLFeedSpider
spider, and it's intended to perform any last time processing required
before returning the results to the framework core, for example setting the
item IDs. It receives a list of results and the response which originated
those results. It must return a list of results (Items or Requests).
those results. It must return a list of results (items or requests).
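
A minimal sketch of the "setting the item IDs" use case mentioned above (the ID
format is an assumption); requests pass through untouched::

    import scrapy
    from scrapy.spiders import XMLFeedSpider

    class MySpider(XMLFeedSpider):
        name = 'example.com'
        start_urls = ['http://www.example.com/feed.xml']
        itertag = 'item'

        def parse_node(self, response, node):
            return {'name': node.xpath('name').get()}

        def process_results(self, response, results):
            processed = []
            for index, result in enumerate(results):
                if isinstance(result, dict):  # items; leave Requests untouched
                    result['id'] = '%s#%d' % (response.url, index)
                processed.append(result)
            return processed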


.. warning:: Because of its internal implementation, you must explicitly set
    callbacks for new requests when writing :class:`XMLFeedSpider`-based spiders;
    unexpected behaviour can occur otherwise.


XMLFeedSpider example
@ -438,8 +560,7 @@ XMLFeedSpider example

These spiders are pretty easy to use, let's have a look at one example::

    from scrapy import log
    from scrapy.contrib.spiders import XMLFeedSpider
    from scrapy.spiders import XMLFeedSpider
    from myproject.items import TestItem

    class MySpider(XMLFeedSpider):
@ -450,12 +571,12 @@ These spiders are pretty easy to use, let's have a look at one example::
        itertag = 'item'

        def parse_node(self, response, node):
            log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
            self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.getall()))

            item = TestItem()
            item['id'] = node.xpath('@id').extract()
            item['name'] = node.xpath('name').extract()
            item['description'] = node.xpath('description').extract()
            item['id'] = node.xpath('@id').get()
            item['name'] = node.xpath('name').get()
            item['description'] = node.xpath('description').get()
            return item

Basically what we did up there was to create a spider that downloads a feed from
@ -476,10 +597,14 @@ CSVFeedSpider
A string with the separator character for each field in the CSV file
Defaults to ``','`` (comma).

.. attribute:: quotechar

A string with the enclosure character for each field in the CSV file
Defaults to ``'"'`` (quotation mark).

.. attribute:: headers

A list of the rows contained in the file CSV feed which will be used to
extract fields from it.
A list of the column names in the CSV file.

.. method:: parse_row(response, row)

@ -494,8 +619,7 @@ CSVFeedSpider example
Let's see an example similar to the previous one, but using a
:class:`CSVFeedSpider`::

    from scrapy import log
    from scrapy.contrib.spiders import CSVFeedSpider
    from scrapy.spiders import CSVFeedSpider
    from myproject.items import TestItem

    class MySpider(CSVFeedSpider):
@ -503,10 +627,11 @@ Let's see an example similar to the previous one, but using a
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/feed.csv']
        delimiter = ';'
        quotechar = "'"
        headers = ['id', 'name', 'description']

        def parse_row(self, response, row):
            log.msg('Hi, this is a row!: %r' % row)
            self.logger.info('Hi, this is a row!: %r', row)

            item = TestItem()
            item['id'] = row['id']
@ -556,7 +681,7 @@ SitemapSpider

.. attribute:: sitemap_follow

A list of regexes of sitemap that should be followed. This is is only
A list of regexes of sitemap that should be followed. This is only
for sites that use `Sitemap index files`_ that point to other sitemap
files.

@ -581,6 +706,50 @@ SitemapSpider

Default is ``sitemap_alternate_links`` disabled.

.. method:: sitemap_filter(entries)

This is a filter function that could be overridden to select sitemap entries
based on their attributes.

For example::

    <url>
        <loc>http://example.com/</loc>
        <lastmod>2005-01-01</lastmod>
    </url>

We can define a ``sitemap_filter`` function to filter ``entries`` by date::

    from datetime import datetime
    from scrapy.spiders import SitemapSpider

    class FilteredSitemapSpider(SitemapSpider):
        name = 'filtered_sitemap_spider'
        allowed_domains = ['example.com']
        sitemap_urls = ['http://example.com/sitemap.xml']

        def sitemap_filter(self, entries):
            for entry in entries:
                date_time = datetime.strptime(entry['lastmod'], '%Y-%m-%d')
                if date_time.year >= 2005:
                    yield entry

This would retrieve only ``entries`` modified in 2005 and the following
years.

Entries are dict objects extracted from the sitemap document.
Usually, the key is the tag name and the value is the text inside it.

It's important to notice that:

- as the loc attribute is required, entries without this tag are discarded
- alternate links are stored in a list with the key ``alternate``
  (see ``sitemap_alternate_links``)
- namespaces are removed, so lxml tags named as ``{namespace}tagname`` become only ``tagname``

If you omit this method, all entries found in sitemaps will be
processed, observing other attributes and their settings.


SitemapSpider examples
~~~~~~~~~~~~~~~~~~~~~~
@ -588,7 +757,7 @@ SitemapSpider examples
Simplest example: process all urls discovered through sitemaps using the
``parse`` callback::

    from scrapy.contrib.spiders import SitemapSpider
    from scrapy.spiders import SitemapSpider

    class MySpider(SitemapSpider):
        sitemap_urls = ['http://www.example.com/sitemap.xml']
@ -599,7 +768,7 @@ Simplest example: process all urls discovered through sitemaps using the
Process some urls with certain callback and other urls with a different
callback::

    from scrapy.contrib.spiders import SitemapSpider
    from scrapy.spiders import SitemapSpider

    class MySpider(SitemapSpider):
        sitemap_urls = ['http://www.example.com/sitemap.xml']
@ -617,7 +786,7 @@ callback::
Follow sitemaps defined in the `robots.txt`_ file and only follow sitemaps
whose url contains ``/sitemap_shop``::

    from scrapy.contrib.spiders import SitemapSpider
    from scrapy.spiders import SitemapSpider

    class MySpider(SitemapSpider):
        sitemap_urls = ['http://www.example.com/robots.txt']
@ -631,7 +800,7 @@ whose url contains ``/sitemap_shop``::

Combine SitemapSpider with other sources of urls::

    from scrapy.contrib.spiders import SitemapSpider
    from scrapy.spiders import SitemapSpider

    class MySpider(SitemapSpider):
        sitemap_urls = ['http://www.example.com/robots.txt']
@ -652,8 +821,8 @@ Combine SitemapSpider with other sources of urls::
        def parse_other(self, response):
            pass # ... scrape other here ...

.. _Sitemaps: http://www.sitemaps.org
.. _Sitemap index files: http://www.sitemaps.org/protocol.php#index
.. _robots.txt: http://www.robotstxt.org/
.. _TLD: http://en.wikipedia.org/wiki/Top-level_domain
.. _Scrapyd documentation: http://scrapyd.readthedocs.org/
.. _Sitemaps: https://www.sitemaps.org/index.html
.. _Sitemap index files: https://www.sitemaps.org/protocol.html#index
.. _robots.txt: https://www.robotstxt.org/
.. _TLD: https://en.wikipedia.org/wiki/Top-level_domain
.. _Scrapyd documentation: https://scrapyd.readthedocs.io/en/latest/

@ -32,7 +32,7 @@ Common Stats Collector uses
Access the stats collector through the :attr:`~scrapy.crawler.Crawler.stats`
attribute. Here is an example of an extension that accesses stats::

    class ExtensionThatAccessStats(object):
    class ExtensionThatAccessStats:

        def __init__(self, stats):
            self.stats = stats
@ -47,7 +47,7 @@ Set stat value::

Increment stat value::

    stats.inc_value('pages_crawled')
    stats.inc_value('custom_count')

Set stat value only if greater than previous::

@ -57,15 +57,15 @@ Set stat value only if lower than previous::

    stats.min_value('min_free_memory_percent', value)

Get stat value::
Get stat value:

>>> stats.get_value('pages_crawled')
8
>>> stats.get_value('custom_count')
1

Get all stats::
Get all stats:

>>> stats.get_stats()
{'pages_crawled': 1238, 'start_time': datetime.datetime(2009, 7, 14, 21, 47, 28, 977139)}
>>> stats.get_stats()
{'custom_count': 1, 'start_time': datetime.datetime(2009, 7, 14, 21, 47, 28, 977139)}
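
Stats are also reachable from spider code; a minimal, hypothetical sketch using
the spider's ``crawler`` attribute::

    import scrapy

    class StatsExampleSpider(scrapy.Spider):
        name = 'stats_example'
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            self.crawler.stats.inc_value('custom_count')
            yield {'url': response.url}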

Available Stats Collectors
==========================

@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
default Stats Collector used is the :class:`MemoryStatsCollector`.

.. module:: scrapy.statscol
    :synopsis: Stats Collectors
.. currentmodule:: scrapy.statscollectors

MemoryStatsCollector
--------------------