diff --git a/.bandit.yml b/.bandit.yml new file mode 100644 index 000000000..243379b0b --- /dev/null +++ b/.bandit.yml @@ -0,0 +1,18 @@ +skips: +- B101 +- B105 +- B301 +- B303 +- B306 +- B307 +- B311 +- B320 +- B321 +- B402 # https://github.com/scrapy/scrapy/issues/4180 +- B403 +- B404 +- B406 +- B410 +- B503 +- B603 +- B605 diff --git a/.bumpversion.cfg b/.bumpversion.cfg index d373d676a..3c1c8f891 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,8 +1,7 @@ [bumpversion] -current_version = 0.25.1 +current_version = 2.3.0 commit = True tag = True tag_name = {new_version} [bumpversion:file:scrapy/VERSION] - diff --git a/.coveragerc b/.coveragerc index 3baaf659a..02acbff8e 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,3 +1,5 @@ [run] +branch = true include = scrapy/* -omit = scrapy/xlib*,scrapy/tests* +omit = + tests/* diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..dfbdf4208 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/sample_data/** binary diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..8ca10109b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,41 @@ +--- +name: Bug report +about: Report a problem to help us improve +--- + + + +### Description + +[Description of the issue] + +### Steps to Reproduce + +1. [First Step] +2. [Second Step] +3. [and so on...] + +**Expected behavior:** [What you expect to happen] + +**Actual behavior:** [What actually happens] + +**Reproduces how often:** [What percentage of the time does it reproduce?] + +### Versions + +Please paste here the output of executing `scrapy version --verbose` in the command line. + +### Additional context + +Any additional information, configuration, data or output from commands that might be necessary to reproduce or understand the issue. Please try not to include screenshots of code or the command line, paste the contents as text instead. You can use [GitHub Flavored Markdown](https://help.github.com/en/articles/creating-and-highlighting-code-blocks) to make the text look better. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..e05273fe2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,33 @@ +--- +name: Feature request +about: Suggest an idea for an enhancement or new feature +--- + + + +## Summary + +One paragraph explanation of the feature. + +## Motivation + +Why are we doing this? What use cases does it support? What is the expected outcome? + +## Describe alternatives you've considered + +A clear and concise description of the alternative solutions you've considered. Be sure to explain why Scrapy's existing customizability isn't suitable for this feature. + +## Additional context + +Any additional information about the feature request here. 
diff --git a/.gitignore b/.gitignore index 4eb80012f..83a2569dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +/.vagrant +/scrapy.iml *.pyc _trial_temp* dropin.cache @@ -8,3 +10,12 @@ venv build dist .idea +htmlcov/ +.coverage +.pytest_cache/ +.coverage.* +.cache/ +.mypy_cache/ + +# Windows +Thumbs.db diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000..e4d3f02cc --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,12 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py + fail_on_warning: true +python: + # For available versions, see: + # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-image + version: 3.7 # Keep in sync with .travis.yml + install: + - requirements: docs/requirements.txt + - path: . diff --git a/.travis-workarounds.sh b/.travis-workarounds.sh deleted file mode 100755 index 5c34e54f7..000000000 --- a/.travis-workarounds.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -e -set -x - -if [[ "${TOXENV}" == "pypy" ]]; then - sudo add-apt-repository -y ppa:pypy/ppa - sudo apt-get -qy update - sudo apt-get install -y pypy pypy-dev - # This is required because we need to get rid of the Travis installed PyPy - # or it'll take precedence over the PPA installed one. - sudo rm -rf /usr/local/pypy/bin -fi - -# Workaround travis-ci/travis-ci#2065 -pip install -U wheel diff --git a/.travis.yml b/.travis.yml index b30d13bed..33a920bb6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,19 +1,74 @@ language: python -python: 2.7 -env: -- TOXENV=py27 -- TOXENV=precise -- TOXENV=py33 +dist: xenial +branches: + only: + - master + - /^\d\.\d+$/ + - /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/ +matrix: + include: + - env: TOXENV=security + python: 3.8 + - env: TOXENV=flake8 + python: 3.8 + - env: TOXENV=pylint + python: 3.8 + - env: TOXENV=docs + python: 3.7 # Keep in sync with .readthedocs.yml + - env: TOXENV=typing + python: 3.8 + + - env: TOXENV=pinned + python: 3.5.2 + - env: TOXENV=asyncio-pinned + python: 3.5.2 # We use additional code to support 3.5.3 and earlier + - env: TOXENV=pypy3-pinned PYPY_VERSION=3-v5.9.0 + + - env: TOXENV=py + python: 3.5 + - env: TOXENV=asyncio + python: 3.5 # We use specific code to support >= 3.5.4, < 3.6 + - env: TOXENV=pypy3 PYPY_VERSION=3.5-v7.0.0 + + - env: TOXENV=py + python: 3.6 + - env: TOXENV=pypy3 PYPY_VERSION=3.6-v7.3.1 + + - env: TOXENV=py + python: 3.7 + + - env: TOXENV=py PYPI_RELEASE_JOB=true + python: 3.8 + dist: bionic + - env: TOXENV=extra-deps + python: 3.8 + dist: bionic + - env: TOXENV=asyncio + python: 3.8 + dist: bionic install: -- "./.travis-workarounds.sh" -- pip install -U tox + - | + if [[ ! 
-z "$PYPY_VERSION" ]]; then + export PYPY_VERSION="pypy$PYPY_VERSION-linux64" + wget "https://downloads.python.org/pypy/${PYPY_VERSION}.tar.bz2" + tar -jxf ${PYPY_VERSION}.tar.bz2 + virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION" + source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate" + fi + - pip install -U tox twine wheel codecov + script: tox +after_success: + - codecov notifications: irc: use_notice: true skip_join: true channels: - irc.freenode.org#scrapy +cache: + directories: + - $HOME/.cache/pip deploy: provider: pypi distributions: "sdist bdist_wheel" @@ -22,6 +77,5 @@ deploy: secure: JaAKcy1AXWXDK3LXdjOtKyaVPCSFoCGCnW15g4f65E/8Fsi9ZzDfmBa4Equs3IQb/vs/if2SVrzJSr7arN7r9Z38Iv1mUXHkFAyA3Ym8mThfABBzzcUWEQhIHrCX0Tdlx9wQkkhs+PZhorlmRS4gg5s6DzPaeA2g8SCgmlRmFfA= on: tags: true - all_branches: true repo: scrapy/scrapy - condition: "$TOXENV == py27 && $TRAVIS_TAG =~ ^[0-9][.][0-9]*[02468][.]" + condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..d1cd3e517 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,74 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of experience, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at opensource@scrapinghub.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at [http://contributor-covenant.org/version/1/4][version]. + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6624b43b6..a05d07aee 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,2 +1,6 @@ The guidelines for contributing are available here: -http://doc.scrapy.org/en/latest/contributing.html +https://docs.scrapy.org/en/master/contributing.html + +Please do not abuse the issue tracker for support questions. +If your issue topic can be rephrased to "How to ...?", please use the +support channels to get it answered: https://scrapy.org/community/ diff --git a/INSTALL b/INSTALL index 84803a933..06e812936 100644 --- a/INSTALL +++ b/INSTALL @@ -1,4 +1,4 @@ For information about installing Scrapy see: * docs/intro/install.rst (local file) -* http://doc.scrapy.org/en/latest/intro/install.html (online version) +* https://docs.scrapy.org/en/latest/intro/install.html (online version) diff --git a/LICENSE b/LICENSE index 68ccf9762..4d0a0863a 100644 --- a/LICENSE +++ b/LICENSE @@ -4,11 +4,11 @@ All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of Scrapy nor the names of its contributors may be used diff --git a/MANIFEST.in b/MANIFEST.in index 0561cc74c..ae7db51fa 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,11 +3,24 @@ include AUTHORS include INSTALL include LICENSE include MANIFEST.in +include NEWS + include scrapy/VERSION include scrapy/mime.types + +include codecov.yml +include conftest.py +include pytest.ini +include requirements-*.txt +include tox.ini + recursive-include scrapy/templates * recursive-include scrapy license.txt recursive-include docs * prune docs/build + recursive-include extras * recursive-include bin * +recursive-include tests * + +global-exclude __pycache__ *.py[cod] diff --git a/Makefile.buildbot b/Makefile.buildbot deleted file mode 100644 index 68c8bdc54..000000000 --- a/Makefile.buildbot +++ /dev/null @@ -1,21 +0,0 @@ -TRIAL := $(shell which trial) -BRANCH := $(shell git rev-parse --abbrev-ref HEAD) -export PYTHONPATH=$(PWD) - -test: - coverage run --branch $(TRIAL) --reporter=text tests - rm -rf htmlcov && coverage html - -s3cmd sync -P htmlcov/ s3://static.scrapy.org/coverage-scrapy-$(BRANCH)/ - -build: - test $(BRANCH) != master || git describe >scrapy/VERSION - python extras/makedeb.py build - -clean: - git checkout debian scrapy/VERSION - git clean -dfq - -pypi: - umask 0022 && chmod -R a+rX . && python setup.py sdist upload - -.PHONY: clean test build diff --git a/README.rst b/README.rst index 6020a3670..0e3939e9b 100644 --- a/README.rst +++ b/README.rst @@ -2,31 +2,46 @@ Scrapy ====== -.. image:: https://badge.fury.io/py/Scrapy.png - :target: http://badge.fury.io/py/Scrapy +.. image:: https://img.shields.io/pypi/v/Scrapy.svg + :target: https://pypi.python.org/pypi/Scrapy + :alt: PyPI Version -.. image:: https://secure.travis-ci.org/scrapy/scrapy.png?branch=master - :target: http://travis-ci.org/scrapy/scrapy +.. image:: https://img.shields.io/pypi/pyversions/Scrapy.svg + :target: https://pypi.python.org/pypi/Scrapy + :alt: Supported Python Versions + +.. image:: https://img.shields.io/travis/scrapy/scrapy/master.svg + :target: https://travis-ci.org/scrapy/scrapy + :alt: Build Status + +.. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg + :target: https://pypi.python.org/pypi/Scrapy + :alt: Wheel Status + +.. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg + :target: https://codecov.io/github/scrapy/scrapy?branch=master + :alt: Coverage report + +.. image:: https://anaconda.org/conda-forge/scrapy/badges/version.svg + :target: https://anaconda.org/conda-forge/scrapy + :alt: Conda Version -.. image:: https://pypip.in/wheel/Scrapy/badge.png - :target: https://pypi.python.org/pypi/Scrapy/ - :alt: Wheel Status Overview ======== -Scrapy is a fast high-level screen scraping and web crawling framework, used to +Scrapy is a fast high-level web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing. -For more information including a list of features check the Scrapy homepage at: -http://scrapy.org +Check the Scrapy homepage at https://scrapy.org for more information, +including a list of features. 
Requirements ============ -* Python 2.7 -* Works on Linux, Windows, Mac OSX, BSD +* Python 3.5.2+ +* Works on Linux, Windows, macOS, BSD Install ======= @@ -35,37 +50,45 @@ The quick way:: pip install scrapy -For more details see the install section in the documentation: -http://doc.scrapy.org/en/latest/intro/install.html - -Releases -======== - -You can download the latest stable and development releases from: -http://scrapy.org/download/ +See the install section in the documentation at +https://docs.scrapy.org/en/latest/intro/install.html for more details. Documentation ============= -Documentation is available online at http://doc.scrapy.org/ and in the ``docs`` +Documentation is available online at https://docs.scrapy.org/ and in the ``docs`` directory. +Releases +======== + +You can check https://docs.scrapy.org/en/latest/news.html for the release notes. + Community (blog, twitter, mail list, IRC) ========================================= -See http://scrapy.org/community/ +See https://scrapy.org/community/ for details. Contributing ============ -See http://doc.scrapy.org/en/latest/contributing.html +See https://docs.scrapy.org/en/master/contributing.html for details. + +Code of Conduct +--------------- + +Please note that this project is released with a Contributor Code of Conduct +(see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md). + +By participating in this project you agree to abide by its terms. +Please report unacceptable behavior to opensource@scrapinghub.com. Companies using Scrapy ====================== -See http://scrapy.org/companies/ +See https://scrapy.org/companies/ for a list. Commercial Support ================== -See http://scrapy.org/support/ +See https://scrapy.org/support/ for details. diff --git a/artwork/README b/artwork/README.rst similarity index 73% rename from artwork/README rename to artwork/README.rst index c185d57da..8a1028cde 100644 --- a/artwork/README +++ b/artwork/README.rst @@ -1,3 +1,4 @@ +============== Scrapy artwork ============== @@ -8,10 +9,10 @@ scrapy-logo.jpg Main Scrapy logo, in JPEG format. -qlassik.zip +qlassik.zip ----------- -Font used for Scrapy logo. Homepage: http://www.dafont.com/qlassik.font +Font used for Scrapy logo. 
Homepage: https://www.dafont.com/qlassik.font scrapy-blog.logo.xcf -------------------- diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..710e42090 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,24 @@ +variables: + TOXENV: py +pool: + vmImage: 'windows-latest' +strategy: + matrix: + Python35: + python.version: '3.5' + TOXENV: windows-pinned + Python36: + python.version: '3.6' + Python37: + python.version: '3.7' + Python38: + python.version: '3.8' +steps: +- task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' +- script: | + pip install -U tox twine wheel codecov + tox + displayName: 'Run test suite' diff --git a/bin/scrapy b/bin/scrapy deleted file mode 100755 index 918ea7fbd..000000000 --- a/bin/scrapy +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python - -from scrapy.cmdline import execute -execute() diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 000000000..d8aa6b984 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,6 @@ +comment: + layout: "header, diff, tree" + +coverage: + status: + project: false diff --git a/conftest.py b/conftest.py index 9f9a5bca7..b39d644a5 100644 --- a/conftest.py +++ b/conftest.py @@ -1,49 +1,55 @@ -import six +from pathlib import Path + import pytest -from twisted.python import log - -from scrapy import optional_features - -collect_ignore = ["scrapy/stats.py"] -if 'django' not in optional_features: - collect_ignore.append("tests/test_djangoitem/models.py") - -if six.PY3: - for fn in open('tests/py3-ignores.txt'): - if fn.strip(): - collect_ignore.append(fn.strip()) - -class LogObservers: - """Class for keeping track of log observers across test modules""" - - def __init__(self): - self.observers = [] - - def add(self, logfile='test.log'): - fileobj = open(logfile, 'wb') - observer = log.FileLogObserver(fileobj) - log.startLoggingWithObserver(observer.emit, 0) - self.observers.append((fileobj, observer)) - - def remove(self): - fileobj, observer = self.observers.pop() - log.removeObserver(observer.emit) - fileobj.close() -@pytest.fixture(scope='module') -def log_observers(): - return LogObservers() +def _py_files(folder): + return (str(p) for p in Path(folder).rglob('*.py')) -@pytest.fixture() -def setlog(request, log_observers): - """Attach test.log file observer to twisted log, for trial compatibility""" - log_observers.add() - request.addfinalizer(log_observers.remove) +collect_ignore = [ + # not a test, but looks like a test + "scrapy/utils/testsite.py", + # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess + *_py_files("tests/CrawlerProcess"), + # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess + *_py_files("tests/CrawlerRunner"), + # Py36-only parts of respective tests + *_py_files("tests/py36"), +] + +for line in open('tests/ignores.txt'): + file_path = line.strip() + if file_path and file_path[0] != '#': + collect_ignore.append(file_path) @pytest.fixture() def chdir(tmpdir): """Change to pytest-provided temporary directory""" tmpdir.chdir() + + +def pytest_collection_modifyitems(session, config, items): + # Avoid executing tests when executing `--flake8` flag (pytest-flake8) + try: + from pytest_flake8 import Flake8Item + if config.getoption('--flake8'): + items[:] = [item for item in items if isinstance(item, Flake8Item)] + except ImportError: + pass + + +@pytest.fixture(scope='class') +def reactor_pytest(request): + if not request.cls: + # doctests + return + 
request.cls.reactor_pytest = request.config.getoption("--reactor") + return request.cls.reactor_pytest + + +@pytest.fixture(autouse=True) +def only_asyncio(request, reactor_pytest): + if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio': + pytest.skip('This test is only run with --reactor=asyncio') diff --git a/debian/changelog b/debian/changelog deleted file mode 100644 index f4f5b9d9c..000000000 --- a/debian/changelog +++ /dev/null @@ -1,5 +0,0 @@ -scrapy-SUFFIX (0.11) unstable; urgency=low - - * Initial release. - - -- Scrapinghub Team Thu, 10 Jun 2010 17:24:02 -0300 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index 7f8f011eb..000000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -7 diff --git a/debian/control b/debian/control deleted file mode 100644 index 4be62895f..000000000 --- a/debian/control +++ /dev/null @@ -1,20 +0,0 @@ -Source: scrapy-SUFFIX -Section: python -Priority: optional -Maintainer: Scrapinghub Team -Build-Depends: debhelper (>= 7.0.50), python (>=2.7), python-twisted, python-w3lib, python-lxml, python-six (>=1.5.2) -Standards-Version: 3.8.4 -Homepage: http://scrapy.org/ - -Package: scrapy-SUFFIX -Architecture: all -Depends: ${python:Depends}, python-lxml, python-twisted, python-openssl, - python-w3lib (>= 1.8.0), python-queuelib, python-cssselect (>= 0.9), python-six (>=1.5.2) -Recommends: python-setuptools -Conflicts: python-scrapy, scrapy, scrapy-0.11 -Provides: python-scrapy, scrapy -Description: Python web crawling and scraping framework - Scrapy is a fast high-level screen scraping and web crawling framework, - used to crawl websites and extract structured data from their pages. - It can be used for a wide range of purposes, from data mining to - monitoring and automated testing. diff --git a/debian/copyright b/debian/copyright deleted file mode 100644 index 4cc239002..000000000 --- a/debian/copyright +++ /dev/null @@ -1,40 +0,0 @@ -This package was debianized by the Scrapinghub team . - -It was downloaded from http://scrapy.org - -Upstream Author: Scrapy Developers - -Copyright: 2007-2013 Scrapy Developers - -License: bsd - -Copyright (c) Scrapy developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of Scrapy nor the names of its contributors may be used - to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The Debian packaging is (C) 2010-2013, Scrapinghub and -is licensed under the BSD, see `/usr/share/common-licenses/BSD'. diff --git a/debian/pyversions b/debian/pyversions deleted file mode 100644 index 1effb0034..000000000 --- a/debian/pyversions +++ /dev/null @@ -1 +0,0 @@ -2.7 diff --git a/debian/rules b/debian/rules deleted file mode 100755 index b8796e6e3..000000000 --- a/debian/rules +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/make -f -# -*- makefile -*- - -%: - dh $@ diff --git a/debian/scrapy.docs b/debian/scrapy.docs deleted file mode 100644 index c19ffba4d..000000000 --- a/debian/scrapy.docs +++ /dev/null @@ -1,2 +0,0 @@ -README.rst -AUTHORS diff --git a/debian/scrapy.install b/debian/scrapy.install deleted file mode 100644 index 5977d5f43..000000000 --- a/debian/scrapy.install +++ /dev/null @@ -1 +0,0 @@ -extras/scrapy_bash_completion etc/bash_completion.d/ diff --git a/debian/scrapy.lintian-overrides b/debian/scrapy.lintian-overrides deleted file mode 100644 index 955e7def0..000000000 --- a/debian/scrapy.lintian-overrides +++ /dev/null @@ -1,2 +0,0 @@ -new-package-should-close-itp-bug -extra-license-file usr/share/pyshared/scrapy/xlib/pydispatch/license.txt diff --git a/debian/scrapy.manpages b/debian/scrapy.manpages deleted file mode 100644 index 4818e9c92..000000000 --- a/debian/scrapy.manpages +++ /dev/null @@ -1 +0,0 @@ -extras/scrapy.1 diff --git a/docs/Makefile b/docs/Makefile index c6e4dd64d..ff68bf1ae 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -8,8 +8,10 @@ PYTHON = python SPHINXOPTS = PAPER = SOURCES = +SHELL = /bin/bash -ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees -D latex_paper_size=$(PAPER) \ +ALLSPHINXOPTS = -b $(BUILDER) -d build/doctrees \ + -D latex_elements.papersize=$(PAPER) \ $(SPHINXOPTS) . 
build/$(BUILDER) $(SOURCES) .PHONY: help update build html htmlhelp clean @@ -22,13 +24,19 @@ help: @echo " text to make plain text files" @echo " changes to make an overview over all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" + @echo " watch build HTML docs, open in browser and watch for changes" - -build: +build-dirs: mkdir -p build/$(BUILDER) build/doctrees + +build: build-dirs sphinx-build $(ALLSPHINXOPTS) @echo +build-ignore-errors: build-dirs + -sphinx-build $(ALLSPHINXOPTS) + @echo + html: BUILDER = html html: build @@ -58,6 +66,12 @@ linkcheck: build @echo "Link check complete; look for any errors in the above output " \ "or in build/$(BUILDER)/output.txt" +linkfix: BUILDER = linkcheck +linkfix: build-ignore-errors + $(PYTHON) utils/linkfix.py + @echo "Fixing redirecting links in docs has finished; check all " \ + "replacements before committing them" + doctest: BUILDER = doctest doctest: build @echo "Testing of doctests in the sources finished, look at the " \ @@ -68,9 +82,15 @@ pydoc-topics: build @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \ "into the Lib/ directory" +coverage: BUILDER = coverage +coverage: build + htmlview: html - $(PYTHON) -c "import webbrowser; webbrowser.open('build/html/index.html')" + $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ + os.path.realpath('build/html/index.html'))" clean: -rm -rf build/* +watch: htmlview + watchmedo shell-command -p '*.rst' -c 'make html' -R -D diff --git a/docs/README b/docs/README.rst similarity index 58% rename from docs/README rename to docs/README.rst index 7fd549374..0b7afa548 100644 --- a/docs/README +++ b/docs/README.rst @@ -1,3 +1,5 @@ +:orphan: + ====================================== Scrapy documentation quick start guide ====================================== @@ -8,16 +10,12 @@ This file provides a quick guide on how to compile the Scrapy documentation. Setup the environment --------------------- -To compile the documentation you need the following Python libraries: +To compile the documentation you need Sphinx Python library. To install it +and all its dependencies run the following command from this dir - * Sphinx - * docutils - * jinja +:: -If you have setuptools available the following command will install all of them -(since Sphinx requires both docutils and jinja):: - - easy_install Sphinx + pip install -r requirements.txt Compile the documentation @@ -52,3 +50,19 @@ To cleanup all generated documentation files and start from scratch run:: Keep in mind that this command won't touch any documentation source files. +Recreating documentation on the fly +----------------------------------- + +There is a way to recreate the doc automatically when you make changes, you +need to install watchdog (``pip install watchdog``) and then use:: + + make watch + +Alternative method using tox +---------------------------- + +To compile the documentation to HTML run the following command:: + + tox -e docs + +Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir. 
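Taken together, the docs/Makefile and docs/README.rst changes above describe the documentation build workflow. A minimal sketch of the equivalent commands, run from the repository root (it assumes docs/requirements.txt pulls in Sphinx and its dependencies, and that watchdog is installed separately for the watch target, as the README notes)::

    pip install -r docs/requirements.txt   # Sphinx and the other docs dependencies
    make -C docs html                      # one-off build into docs/build/html
    make -C docs watch                     # build, open in a browser, rebuild on *.rst changes (needs watchdog)
    tox -e docs                            # the same build inside an isolated tox environment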
diff --git a/docs/_ext/scrapydocs.py b/docs/_ext/scrapydocs.py index 1fa1c93d6..640660943 100644 --- a/docs/_ext/scrapydocs.py +++ b/docs/_ext/scrapydocs.py @@ -1,5 +1,82 @@ from docutils.parsers.rst.roles import set_classes from docutils import nodes +from docutils.parsers.rst import Directive +from sphinx.util.nodes import make_refnode +from operator import itemgetter + + +class settingslist_node(nodes.General, nodes.Element): + pass + + +class SettingsListDirective(Directive): + def run(self): + return [settingslist_node('')] + + +def is_setting_index(node): + if node.tagname == 'index': + # index entries for setting directives look like: + # [('pair', 'SETTING_NAME; setting', 'std:setting-SETTING_NAME', '')] + entry_type, info, refid = node['entries'][0][:3] + return entry_type == 'pair' and info.endswith('; setting') + return False + + +def get_setting_target(node): + # target nodes are placed next to the node in the doc tree + return node.parent[node.parent.index(node) + 1] + + +def get_setting_name_and_refid(node): + """Extract setting name from directive index node""" + entry_type, info, refid = node['entries'][0][:3] + return info.replace('; setting', ''), refid + + +def collect_scrapy_settings_refs(app, doctree): + env = app.builder.env + + if not hasattr(env, 'scrapy_all_settings'): + env.scrapy_all_settings = [] + + for node in doctree.traverse(is_setting_index): + targetnode = get_setting_target(node) + assert isinstance(targetnode, nodes.target), "Next node is not a target" + + setting_name, refid = get_setting_name_and_refid(node) + + env.scrapy_all_settings.append({ + 'docname': env.docname, + 'setting_name': setting_name, + 'refid': refid, + }) + + +def make_setting_element(setting_data, app, fromdocname): + refnode = make_refnode(app.builder, fromdocname, + todocname=setting_data['docname'], + targetid=setting_data['refid'], + child=nodes.Text(setting_data['setting_name'])) + p = nodes.paragraph() + p += refnode + + item = nodes.list_item() + item += p + return item + + +def replace_settingslist_nodes(app, doctree, fromdocname): + env = app.builder.env + + for node in doctree.traverse(settingslist_node): + settings_list = nodes.bullet_list() + settings_list.extend([make_setting_element(d, app, fromdocname) + for d in sorted(env.scrapy_all_settings, + key=itemgetter('setting_name')) + if fromdocname != d['docname']]) + node.replace_self(settings_list) + def setup(app): app.add_crossref_type( @@ -27,24 +104,34 @@ def setup(app): app.add_role('issue', issue_role) app.add_role('rev', rev_role) + app.add_node(settingslist_node) + app.add_directive('settingslist', SettingsListDirective) + + app.connect('doctree-read', collect_scrapy_settings_refs) + app.connect('doctree-resolved', replace_settingslist_nodes) + + def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]): ref = 'https://github.com/scrapy/scrapy/blob/master/' + text set_classes(options) node = nodes.reference(rawtext, text, refuri=ref, **options) return [node], [] + def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]): ref = 'https://github.com/scrapy/scrapy/issues/' + text set_classes(options) node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options) return [node], [] + def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]): ref = 'https://github.com/scrapy/scrapy/commit/' + text set_classes(options) node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options) return [node], [] + def rev_role(name, rawtext, text, lineno, 
inliner, options={}, content=[]): ref = 'http://hg.scrapy.org/scrapy/changeset/' + text set_classes(options) diff --git a/docs/_static/scrapydoc.css b/docs/_static/scrapydoc.css deleted file mode 100644 index 3e58a5e70..000000000 --- a/docs/_static/scrapydoc.css +++ /dev/null @@ -1,657 +0,0 @@ -/** - * Sphinx Doc Design - */ - -body { - font-family: sans-serif; - font-size: 100%; - background-color: #3d1e11; - color: #000; - margin: 0; - padding: 0; -} - -/* :::: LAYOUT :::: */ - -div.document { - background-color: #69341e; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 230px; -} - -div.body { - background-color: white; - padding: 0 20px 30px 20px; -} - -div.sphinxsidebarwrapper { - padding: 10px 5px 0 10px; -} - -div.sphinxsidebar { - float: left; - width: 230px; - margin-left: -100%; - font-size: 90%; -} - -div.clearer { - clear: both; -} - -div.footer { - color: #fff; - width: 100%; - padding: 9px 0 9px 0; - text-align: center; - font-size: 75%; -} - -div.footer a { - color: #fff; - text-decoration: underline; -} - -div.related { - background-color: #5b1616; - color: #fff; - width: 100%; - line-height: 30px; - font-size: 90%; -} - -div.related h3 { - display: none; -} - -div.related ul { - margin: 0; - padding: 0 0 0 10px; - list-style: none; -} - -div.related li { - display: inline; -} - -div.related li.right { - float: right; - margin-right: 5px; -} - -div.related a { - color: white; -} - -/* ::: TOC :::: */ -div.sphinxsidebar h3 { - font-family: 'Trebuchet MS', sans-serif; - color: white; - font-size: 1.4em; - font-weight: normal; - margin: 0; - padding: 0; -} - -div.sphinxsidebar h3 a { - color: white; -} - -div.sphinxsidebar h4 { - font-family: 'Trebuchet MS', sans-serif; - color: white; - font-size: 1.3em; - font-weight: normal; - margin: 5px 0 0 0; - padding: 0; -} - -div.sphinxsidebar p { - color: white; -} - -div.sphinxsidebar p.topless { - margin: 5px 10px 10px 10px; -} - -div.sphinxsidebar ul { - margin: 10px; - padding: 0; - list-style: none; - color: white; -} - -div.sphinxsidebar ul ul, -div.sphinxsidebar ul.want-points { - margin-left: 20px; - list-style: square; -} - -div.sphinxsidebar ul ul { - margin-top: 0; - margin-bottom: 0; -} - -div.sphinxsidebar a { - color: #ffca9b; -} - -div.sphinxsidebar form { - margin-top: 10px; -} - -div.sphinxsidebar input { - border: 1px solid #ffca9b; - font-family: sans-serif; - font-size: 1em; -} - -/* :::: MODULE CLOUD :::: */ -div.modulecloud { - margin: -5px 10px 5px 10px; - padding: 10px; - line-height: 160%; - border: 1px solid #cbe7e5; - background-color: #f2fbfd; -} - -div.modulecloud a { - padding: 0 5px 0 5px; -} - -/* :::: SEARCH :::: */ -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li div.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* :::: COMMON FORM STYLES :::: */ - -div.actions { - padding: 5px 10px 5px 10px; - border-top: 1px solid #cbe7e5; - border-bottom: 1px solid #cbe7e5; - background-color: #e0f6f4; -} - -form dl { - color: #333; -} - -form dt { - clear: both; - float: left; - min-width: 110px; - margin-right: 10px; - padding-top: 2px; -} - -input#homepage { - display: none; -} - -div.error { - margin: 5px 20px 0 0; - padding: 5px; - border: 1px solid #d00; - 
font-weight: bold; -} - -/* :::: INDEX PAGE :::: */ - -table.contentstable { - width: 90%; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - -span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* :::: INDEX STYLES :::: */ - -table.indextable td { - text-align: left; - vertical-align: top; -} - -table.indextable dl, table.indextable dd { - margin-top: 0; - margin-bottom: 0; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -form.pfform { - margin: 10px 0 20px 0; -} - -/* :::: GLOBAL STYLES :::: */ - -.docwarning { - background-color: #ffe4e4; - padding: 10px; - margin: 0 -20px 0 -20px; - border-bottom: 1px solid #f66; -} - -p.subhead { - font-weight: bold; - margin-top: 20px; -} - -a { - color: #6e0909; - text-decoration: none; -} - -a:hover { - text-decoration: underline; -} - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: 'Trebuchet MS', sans-serif; - background-color: #f2f2f2; - font-weight: normal; - color: #331F0A; - border-bottom: 1px solid #ccc; - margin: 20px -20px 10px -20px; - padding: 3px 0 3px 10px; -} - -div.body h1 { margin-top: 0; font-size: 200%; } -div.body h2 { font-size: 160%; } -div.body h3 { font-size: 140%; } -div.body h4 { font-size: 120%; } -div.body h5 { font-size: 110%; } -div.body h6 { font-size: 100%; } - -a.headerlink { - color: #c60f0f; - font-size: 0.8em; - padding: 0 4px 0 4px; - text-decoration: none; - visibility: hidden; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink { - visibility: visible; -} - -a.headerlink:hover { - background-color: #c60f0f; - color: white; -} - -div.body p, div.body dd, div.body li { - text-align: justify; - line-height: 130%; -} - -div.body p.caption { - text-align: inherit; -} - -div.body td { - text-align: left; -} - -ul.fakelist { - list-style: none; - margin: 10px 0 10px 20px; - padding: 0; -} - -.field-list ul { - padding-left: 1em; -} - -.first { - margin-top: 0 !important; -} - -/* "Footnotes" heading */ -p.rubric { - margin-top: 30px; - font-weight: bold; -} - -/* Sidebars */ - -div.sidebar { - margin: 0 0 0.5em 1em; - border: 1px solid #ddb; - padding: 7px 7px 0 7px; - background-color: #ffe; - width: 40%; - float: right; -} - -p.sidebar-title { - font-weight: bold; -} - -/* "Topics" */ - -div.topic { - background-color: #eee; - border: 1px solid #ccc; - padding: 7px 7px 0 7px; - margin: 10px 0 10px 0; -} - -p.topic-title { - font-size: 1.1em; - font-weight: bold; - margin-top: 10px; -} - -/* Admonitions */ - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 7px; -} - -div.admonition dt { - font-weight: bold; -} - -div.admonition dl { - margin-bottom: 0; -} - -div.admonition p.admonition-title + p { - display: inline; -} - -div.seealso { - background-color: #ffc; - border: 1px solid #ff6; -} - -div.warning { - background-color: #ffe4e4; - border: 1px solid #f66; -} - -div.note { - background-color: #eee; - border: 1px solid #ccc; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -div.body p.centered { - text-align: center; - margin-top: 25px; -} - -table.docutils { - 
border: 0; -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 0; - border-top: 0; - border-left: 0; - border-right: 0; - border-bottom: 1px solid #aaa; -} - -table.field-list td, table.field-list th { - border: 0 !important; -} - -table.footnote td, table.footnote th { - border: 0 !important; -} - -.field-list ul { - margin: 0; - padding-left: 1em; -} - -.field-list p { - margin: 0; -} - -dl { - margin-bottom: 15px; - clear: both; -} - -dd p { - margin-top: 0px; -} - -dd ul, dd table { - margin-bottom: 10px; -} - -dd { - margin-top: 3px; - margin-bottom: 10px; - margin-left: 30px; -} - -.refcount { - color: #060; -} - -dt:target, -.highlight { - background-color: #fbe54e; -} - -dl.glossary dt { - font-weight: bold; - font-size: 1.1em; -} - -th { - text-align: left; - padding-right: 5px; -} - -pre { - padding: 5px; - background-color: #efc; - color: #333; - border: 1px solid #ac9; - border-left: none; - border-right: none; - overflow: auto; -} - -td.linenos pre { - padding: 5px 0px; - border: 0; - background-color: transparent; - color: #aaa; -} - -table.highlighttable { - margin-left: 0.5em; -} - -table.highlighttable td { - padding: 0 0.5em 0 0.5em; -} - -tt { - background-color: #ecf0f3; - padding: 0 1px 0 1px; - font-size: 0.95em; -} - -tt.descname { - background-color: transparent; - font-weight: bold; - font-size: 1.2em; -} - -tt.descclassname { - background-color: transparent; -} - -tt.xref, a tt { - background-color: transparent; - font-weight: bold; -} - -.footnote:target { background-color: #ffa } - -h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt { - background-color: transparent; -} - -.optional { - font-size: 1.3em; -} - -.versionmodified { - font-style: italic; -} - -form.comment { - margin: 0; - padding: 10px 30px 10px 30px; - background-color: #eee; -} - -form.comment h3 { - background-color: #326591; - color: white; - margin: -10px -30px 10px -30px; - padding: 5px; - font-size: 1.4em; -} - -form.comment input, -form.comment textarea { - border: 1px solid #ccc; - padding: 2px; - font-family: sans-serif; - font-size: 100%; -} - -form.comment input[type="text"] { - width: 240px; -} - -form.comment textarea { - width: 100%; - height: 200px; - margin-bottom: 10px; -} - -.system-message { - background-color: #fda; - padding: 5px; - border: 3px solid red; -} - -img.math { - vertical-align: middle; -} - -div.math p { - text-align: center; -} - -span.eqno { - float: right; -} - -img.logo { - border: 0; -} - -/* :::: PRINT :::: */ -@media print { - div.document, - div.documentwrapper, - div.bodywrapper { - margin: 0; - width : 100%; - } - - div.sphinxsidebar, - div.related, - div.footer, - div#comments div.new-comment-box, - #top-link { - display: none; - } -} diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 000000000..a6f6cbda8 --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,16 @@ +{% extends "!layout.html" %} + +{% block footer %} +{{ super() }} + +{% endblock %} diff --git a/docs/_tests/quotes.html b/docs/_tests/quotes.html new file mode 100644 index 000000000..71aff8847 --- /dev/null +++ b/docs/_tests/quotes.html @@ -0,0 +1,281 @@ + + + + + Quotes to Scrape + + + + +
  [... remaining HTML of docs/_tests/quotes.html omitted: a saved "Quotes to Scrape" sample page (ten quote blocks with author, tag and "Top Ten tags" markup) added as a fixture for the documentation tests; the markup was lost in extraction ...]
\ No newline at end of file
diff --git a/docs/_tests/quotes1.html b/docs/_tests/quotes1.html
new file mode 100644
index 000000000..71aff8847
--- /dev/null
+++ b/docs/_tests/quotes1.html
@@ -0,0 +1,281 @@
  [... identical 281-line copy of docs/_tests/quotes.html, likewise omitted ...]
+ + + \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 7acf7c7fa..427c79481 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# # Scrapy documentation build configuration file, created by # sphinx-quickstart on Mon Nov 24 12:02:52 2008. # @@ -12,13 +10,14 @@ # serve to show the default. import sys +from datetime import datetime from os import path # If your extensions are in another directory, add it here. If the directory # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.append(path.join(path.dirname(__file__), "_ext")) -sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy")) +sys.path.insert(0, path.dirname(path.dirname(__file__))) # General configuration @@ -26,7 +25,15 @@ sys.path.append(path.join(path.dirname(path.dirname(__file__)), "scrapy")) # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['scrapydocs'] +extensions = [ + 'hoverxref.extension', + 'notfound.extension', + 'scrapydocs', + 'sphinx.ext.autodoc', + 'sphinx.ext.coverage', + 'sphinx.ext.intersphinx', + 'sphinx.ext.viewcode', +] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -41,8 +48,8 @@ source_suffix = '.rst' master_doc = 'index' # General information about the project. -project = u'Scrapy' -copyright = u'2008-2013, Scrapy developers' +project = 'Scrapy' +copyright = '2008–{}, Scrapy developers'.format(datetime.now().year) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -70,6 +77,8 @@ language = 'en' # List of documents that shouldn't be included in the build. #unused_docs = [] +exclude_patterns = ['build'] + # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = ['.build'] @@ -91,14 +100,33 @@ exclude_trees = ['.build'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +# List of Sphinx warnings that will not be raised +suppress_warnings = ['epub.unknown_project_files'] + # Options for HTML output # ----------------------- +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# Add path to the RTD explicitly to robustify builds (otherwise might +# fail in a clean Debian build env) +import sphinx_rtd_theme +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + + # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. -html_style = 'scrapydoc.css' +# html_style = 'scrapydoc.css' # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". @@ -125,10 +153,6 @@ html_static_path = ['_static'] # using the given strftime format. html_last_updated_fmt = '%b %d, %Y' -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. 
-html_use_smartypants = True - # Custom sidebar templates, maps document names to template names. #html_sidebars = {} @@ -172,8 +196,8 @@ htmlhelp_basename = 'Scrapydoc' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ - ('index', 'Scrapy.tex', ur'Scrapy Documentation', - ur'Scrapy developers', 'manual'), + ('index', 'Scrapy.tex', 'Scrapy Documentation', + 'Scrapy developers', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -203,3 +227,94 @@ linkcheck_ignore = [ 'http://localhost:\d+', 'http://hg.scrapy.org', 'http://directory.google.com/' ] + + +# Options for the Coverage extension +# ---------------------------------- +coverage_ignore_pyobjects = [ + # Contract’s add_pre_hook and add_post_hook are not documented because + # they should be transparent to contract developers, for whom pre_hook and + # post_hook should be the actual concern. + r'\bContract\.add_(pre|post)_hook$', + + # ContractsManager is an internal class, developers are not expected to + # interact with it directly in any way. + r'\bContractsManager\b$', + + # For default contracts we only want to document their general purpose in + # their __init__ method, the methods they reimplement to achieve that purpose + # should be irrelevant to developers using those contracts. + r'\w+Contract\.(adjust_request_args|(pre|post)_process)$', + + # Methods of downloader middlewares are not documented, only the classes + # themselves, since downloader middlewares are controlled through Scrapy + # settings. + r'^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.', + + # Base classes of downloader middlewares are implementation details that + # are not meant for users. + r'^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware', + + # Private exception used by the command-line interface implementation. + r'^scrapy\.exceptions\.UsageError', + + # Methods of BaseItemExporter subclasses are only documented in + # BaseItemExporter. + r'^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.', + + # Extension behavior is only modified through settings. Methods of + # extension classes, as well as helper functions, are implementation + # details that are not documented. + r'^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.', # methods + r'^scrapy\.extensions\.[a-z]\w*?\.[a-z]', # helper functions + + # Never documented before, and deprecated now. 
+ r'^scrapy\.item\.DictItem$', + r'^scrapy\.linkextractors\.FilteringLinkExtractor$', + + # Implementation detail of LxmlLinkExtractor + r'^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor', +] + + +# Options for the InterSphinx extension +# ------------------------------------- + +intersphinx_mapping = { + 'attrs': ('https://www.attrs.org/en/stable/', None), + 'coverage': ('https://coverage.readthedocs.io/en/stable', None), + 'cssselect': ('https://cssselect.readthedocs.io/en/latest', None), + 'itemloaders': ('https://itemloaders.readthedocs.io/en/latest/', None), + 'pytest': ('https://docs.pytest.org/en/latest', None), + 'python': ('https://docs.python.org/3', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master', None), + 'tox': ('https://tox.readthedocs.io/en/latest', None), + 'twisted': ('https://twistedmatrix.com/documents/current', None), + 'twistedapi': ('https://twistedmatrix.com/documents/current/api', None), +} + + +# Options for sphinx-hoverxref options +# ------------------------------------ + +hoverxref_auto_ref = True +hoverxref_role_types = { + "class": "tooltip", + "confval": "tooltip", + "hoverxref": "tooltip", + "mod": "tooltip", + "ref": "tooltip", +} +hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal'] + + +def setup(app): + app.connect('autodoc-skip-member', maybe_skip_member) + + +def maybe_skip_member(app, what, name, obj, skip, options): + if not skip: + # autodocs was generating a text "alias of" for the following members + # https://github.com/sphinx-doc/sphinx/issues/4422 + return name in {'default_item_class', 'default_selector_class'} + return skip diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 000000000..8c735e838 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,29 @@ +import os +from doctest import ELLIPSIS, NORMALIZE_WHITESPACE + +from scrapy.http.response.html import HtmlResponse +from sybil import Sybil +from sybil.parsers.codeblock import CodeBlockParser +from sybil.parsers.doctest import DocTestParser +from sybil.parsers.skip import skip + + +def load_response(url, filename): + input_path = os.path.join(os.path.dirname(__file__), '_tests', filename) + with open(input_path, 'rb') as input_file: + return HtmlResponse(url, body=input_file.read()) + + +def setup(namespace): + namespace['load_response'] = load_response + + +pytest_collect_file = Sybil( + parsers=[ + DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), + CodeBlockParser(future_imports=['print_function']), + skip, + ], + pattern='*.rst', + setup=setup, +).pytest() diff --git a/docs/contributing.rst b/docs/contributing.rst index d7a47a746..525ad3497 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -4,22 +4,31 @@ Contributing to Scrapy ====================== +.. important:: + + Double check that you are reading the most recent version of this document at + https://docs.scrapy.org/en/master/contributing.html + There are many ways to contribute to Scrapy. Here are some of them: * Blog about Scrapy. Tell the world how you're using Scrapy. This will help - newcomers with more examples and the Scrapy project to increase its + newcomers with more examples and will help the Scrapy project to increase its visibility. * Report bugs and request features in the `issue tracker`_, trying to follow the guidelines detailed in `Reporting bugs`_ below. -* Submit patches for new functionality and/or bug fixes. 
Please read - `Writing patches`_ and `Submitting patches`_ below for details on how to +* Submit patches for new functionalities and/or bug fixes. Please read + :ref:`writing-patches` and `Submitting patches`_ below for details on how to write and submit a patch. -* Join the `scrapy-users`_ mailing list and share your ideas on how to +* Join the `Scrapy subreddit`_ and share your ideas on how to improve Scrapy. We're always open to suggestions. +* Answer Scrapy questions at + `Stack Overflow `__. + + Reporting bugs ============== @@ -30,33 +39,48 @@ Reporting bugs trusted Scrapy developers, and its archives are not public. Well-written bug reports are very helpful, so keep in mind the following -guidelines when reporting a new bug. +guidelines when you're going to report a new bug. * check the :ref:`FAQ ` first to see if your issue is addressed in a well-known question -* check the `open issues`_ to see if it has already been reported. If it has, - don't dismiss the report but check the ticket history and comments, you may - find additional useful information to contribute. +* if you have a general question about Scrapy usage, please ask it at + `Stack Overflow `__ + (use "scrapy" tag). -* search the `scrapy-users`_ list to see if it has been discussed there, or - if you're not sure if what you're seeing is a bug. You can also ask in the - `#scrapy` IRC channel. +* check the `open issues`_ to see if the issue has already been reported. If it + has, don't dismiss the report, but check the ticket history and comments. If + you have additional useful information, please leave a comment, or consider + :ref:`sending a pull request ` with a fix. -* write complete, reproducible, specific bug reports. The smaller the test +* search the `scrapy-users`_ list and `Scrapy subreddit`_ to see if it has + been discussed there, or if you're not sure if what you're seeing is a bug. + You can also ask in the ``#scrapy`` IRC channel. + +* write **complete, reproducible, specific bug reports**. The smaller the test case, the better. Remember that other developers won't have your project to reproduce the bug, so please include all relevant files required to reproduce - it. + it. See for example StackOverflow's guide on creating a + `Minimal, Complete, and Verifiable example`_ exhibiting the issue. + +* the most awesome way to provide a complete reproducible example is to + send a pull request which adds a failing test case to the + Scrapy testing suite (see :ref:`submitting-patches`). + This is helpful even if you don't have an intention to + fix the issue yourselves. * include the output of ``scrapy version -v`` so developers working on your bug know exactly which version and platform it occurred on, which is often very helpful for reproducing it, or knowing if it was already fixed. +.. _Minimal, Complete, and Verifiable example: https://stackoverflow.com/help/mcve + +.. _writing-patches: + Writing patches =============== -The better written a patch is, the higher chance that it'll get accepted and -the sooner that will be merged. +The better a patch is written, the higher the chances that it'll get accepted and the sooner it will be merged. Well-written patches should: @@ -75,10 +99,26 @@ Well-written patches should: the documentation changes in the same patch. See `Documentation policies`_ below. +* if you're adding a private API, please add a regular expression to the + ``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new + private API from documentation coverage checks. 
+ + To see if your private API is skipped properly, generate a documentation + coverage report as follows:: + + tox -e docs-coverage + +* if you are removing deprecated code, first make sure that at least 1 year + (12 months) has passed since the release that introduced the deprecation. + See :ref:`deprecation-policy`. + + +.. _submitting-patches: + Submitting patches ================== -The best way to submit a patch is to issue a `pull request`_ on Github, +The best way to submit a patch is to issue a `pull request`_ on GitHub, optionally creating a new issue first. Remember to explain what was fixed or the new functionality (what it is, why @@ -88,15 +128,41 @@ developers to understand and accept your patch. You can also discuss the new functionality (or bug fix) before creating the patch, but it's always good to have a patch ready to illustrate your arguments and show that you have put some additional thought into the subject. A good -starting point is to send a pull request on Github. It can be simple enough to +starting point is to send a pull request on GitHub. It can be simple enough to illustrate your idea, and leave documentation/tests for later, after the idea -has been validated and proven useful. Alternatively, you can send an email to -`scrapy-users`_ to discuss your idea first. +has been validated and proven useful. Alternatively, you can start a +conversation in the `Scrapy subreddit`_ to discuss your idea first. + +Sometimes there is an existing pull request for the problem you'd like to +solve, which is stalled for some reason. Often the pull request is in a +right direction, but changes are requested by Scrapy maintainers, and the +original pull request author hasn't had time to address them. +In this case consider picking up this pull request: open +a new pull request with all commits from the original pull request, as well as +additional changes to address the raised issues. Doing so helps a lot; it is +not considered rude as soon as the original author is acknowledged by keeping +his/her commits. + +You can pull an existing pull request to a local branch +by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE`` +(replace 'upstream' with a remote name for scrapy repository, +``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE`` +with a name of the branch you want to create locally). +See also: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally. + +When writing GitHub pull requests, try to keep titles short but descriptive. +E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests" +prefer "Fix hanging when exception occurs in start_requests (#411)" +instead of "Fix for #411". Complete titles make it easy to skim through +the issue tracker. Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports -removal, etc) in separate commits than functional changes. This will make pull +removal, etc) in separate commits from functional changes. This will make pull requests easier to review and more likely to get merged. + +.. _coding-style: + Coding style ============ @@ -105,50 +171,84 @@ Scrapy: * Unless otherwise specified, follow :pep:`8`. -* It's OK to use lines longer than 80 chars if it improves the code +* It's OK to use lines longer than 79 chars if it improves the code readability. -* Don't put your name in the code you contribute. 
Our policy is to keep - the contributor's name in the `AUTHORS`_ file distributed with Scrapy. +* Don't put your name in the code you contribute; git provides enough + metadata to identify author of the code. + See https://help.github.com/en/github/using-git/setting-your-username-in-git for + setup instructions. -Scrapy Contrib -============== - -Scrapy contrib shares a similar rationale as Django contrib, which is explained -in `this post `_. If you -are working on a new functionality, please follow that rationale to decide -whether it should be a Scrapy contrib. If unsure, you can ask in -`scrapy-users`_. +.. _documentation-policies: Documentation policies ====================== -* **Don't** use docstrings for documenting classes, or methods which are - already documented in the official (sphinx) documentation. For example, the - :meth:`ItemLoader.add_value` method should be documented in the sphinx - documentation, not its docstring. +For reference documentation of API members (classes, methods, etc.) use +docstrings and make sure that the Sphinx documentation uses the +:mod:`~sphinx.ext.autodoc` extension to pull the docstrings. API reference +documentation should follow docstring conventions (`PEP 257`_) and be +IDE-friendly: short, to the point, and it may provide short examples. -* **Do** use docstrings for documenting functions not present in the official - (sphinx) documentation, such as functions from ``scrapy.utils`` package and - its sub-modules. +Other types of documentation, such as tutorials or topics, should be covered in +files within the ``docs/`` directory. This includes documentation that is +specific to an API member, but goes beyond API reference documentation. + +In any case, if something is covered in a docstring, use the +:mod:`~sphinx.ext.autodoc` extension to pull the docstring into the +documentation instead of duplicating the docstring in files within the +``docs/`` directory. Tests ===== -Tests are implemented using the `Twisted unit-testing framework`_, running -tests requires `tox`_. +Tests are implemented using the :doc:`Twisted unit-testing framework +`. Running tests requires +:doc:`tox `. + +.. _running-tests: Running tests ------------- -To run all tests go to the root directory of Scrapy source code and run: +To run all tests:: - ``tox`` + tox -To run a specific test (say ``tests/test_contrib_loader.py``) use: +To run a specific test (say ``tests/test_loader.py``) use: - ``tox -- tests/test_contrib_loader.py`` + ``tox -- tests/test_loader.py`` +To run the tests on a specific :doc:`tox ` environment, use +``-e `` with an environment name from ``tox.ini``. For example, to run +the tests with Python 3.6 use:: + + tox -e py36 + +You can also specify a comma-separated list of environments, and use :ref:`tox’s +parallel mode ` to run the tests on multiple environments in +parallel:: + + tox -e py36,py38 -p auto + +To pass command-line options to :doc:`pytest `, add them after +``--`` in your call to :doc:`tox `. Using ``--`` overrides the +default positional arguments defined in ``tox.ini``, so you must include those +default positional arguments (``scrapy tests``) after ``--`` as well:: + + tox -- scrapy tests -x # stop after first failure + +You can also use the `pytest-xdist`_ plugin. 
For example, to run all tests on +the Python 3.6 :doc:`tox ` environment using all your CPU cores:: + + tox -e py36 -- scrapy tests -n auto + +To see coverage report install :doc:`coverage ` +(``pip install coverage``) and run: + + ``coverage report`` + +see output of ``coverage --help`` for more options like html or xml report. Writing tests ------------- @@ -161,17 +261,18 @@ Scrapy uses unit-tests, which are located in the `tests/`_ directory. Their module name typically resembles the full path of the module they're testing. For example, the item loaders code is in:: - scrapy.contrib.loader + scrapy.loader And their unit-tests are in:: - tests/test_contrib_loader.py + tests/test_loader.py .. _issue tracker: https://github.com/scrapy/scrapy/issues -.. _scrapy-users: http://groups.google.com/group/scrapy-users -.. _Twisted unit-testing framework: http://twistedmatrix.com/documents/current/core/development/policy/test-standard.html +.. _scrapy-users: https://groups.google.com/forum/#!forum/scrapy-users +.. _Scrapy subreddit: https://reddit.com/r/scrapy .. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS .. _tests/: https://github.com/scrapy/scrapy/tree/master/tests .. _open issues: https://github.com/scrapy/scrapy/issues -.. _pull request: http://help.github.com/send-pull-requests/ -.. _tox: https://pypi.python.org/pypi/tox +.. _PEP 257: https://www.python.org/dev/peps/pep-0257/ +.. _pull request: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request +.. _pytest-xdist: https://github.com/pytest-dev/pytest-xdist diff --git a/docs/experimental/index.rst b/docs/experimental/index.rst deleted file mode 100644 index 1c019c396..000000000 --- a/docs/experimental/index.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. _experimental: - -Experimental features -===================== - -This section documents experimental Scrapy features that may become stable in -future releases, but whose API is not yet stable. Use them with caution, and -subscribe to the `mailing lists `_ to get -notified of any changes. - -Since it's not revised so frequently, this section may contain documentation -which is outdated, incomplete or overlapping with stable documentation (until -it's properly merged) . Use at your own risk. - -.. warning:: - - This documentation is a work in progress. Use at your own risk. - -Add commands using external libraries -------------------------------------- - -You can also add Scrapy commands from an external library by adding `scrapy.commands` section into entry_points in the `setup.py`. - -The following example adds `my_command` command:: - - from setuptools import setup, find_packages - - setup(name='scrapy-mymodule', - entry_points={ - 'scrapy.commands': [ - 'my_command=my_scrapy_module.commands:MyCommand', - ], - }, - ) diff --git a/docs/faq.rst b/docs/faq.rst index 47bfede71..9346ec358 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -3,6 +3,8 @@ Frequently Asked Questions ========================== +.. _faq-scrapy-bs-cmp: + How does Scrapy compare to BeautifulSoup or lxml? ------------------------------------------------- @@ -19,33 +21,56 @@ Python code. In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like comparing `jinja2`_ to `Django`_. -.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ -.. _lxml: http://lxml.de/ -.. _jinja2: http://jinja.pocoo.org/2/ -.. _Django: http://www.djangoproject.com +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ +.. _lxml: https://lxml.de/ +.. 
_jinja2: https://palletsprojects.com/p/jinja/ +.. _Django: https://www.djangoproject.com/ -.. _faq-python-versions: +Can I use Scrapy with BeautifulSoup? +------------------------------------ -What Python versions does Scrapy support? ------------------------------------------ +Yes, you can. +As mentioned :ref:`above `, `BeautifulSoup`_ can be used +for parsing HTML responses in Scrapy callbacks. +You just have to feed the response's body into a ``BeautifulSoup`` object +and extract whatever data you need from it. -Scrapy is supported under Python 2.7 only. -Python 2.6 support was dropped starting at Scrapy 0.20. +Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML parser:: -Does Scrapy work with Python 3? ---------------------------------- -No, but there are plans to support Python 3.3+. -At the moment, Scrapy works with Python 2.7. + from bs4 import BeautifulSoup + import scrapy + + + class ExampleSpider(scrapy.Spider): + name = "example" + allowed_domains = ["example.com"] + start_urls = ( + 'http://www.example.com/', + ) + + def parse(self, response): + # use lxml to get decent HTML parsing speed + soup = BeautifulSoup(response.text, 'lxml') + yield { + "url": response.url, + "title": soup.h1.string + } + +.. note:: + + ``BeautifulSoup`` supports several HTML/XML parsers. + See `BeautifulSoup's official documentation`_ on which ones are available. + +.. _BeautifulSoup's official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use -.. seealso:: :ref:`faq-python-versions`. Did Scrapy "steal" X from Django? --------------------------------- Probably, but we don't like that word. We think Django_ is a great open source project and an example to follow, so we've used it as an inspiration for -Scrapy. +Scrapy. We believe that, if something is already done well, there's no need to reinvent it. This concept, besides being one of the foundations for open source and free @@ -57,14 +82,12 @@ focus on the real problems we need to solve. We'd be proud if Scrapy serves as an inspiration for other projects. Feel free to steal from us! -.. _Django: http://www.djangoproject.com - Does Scrapy work with HTTP proxies? ----------------------------------- Yes. Support for HTTP proxies is provided (since Scrapy 0.8) through the HTTP Proxy downloader middleware. See -:class:`~scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware`. +:class:`~scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware`. How can I scrape an item with attributes in different pages? ------------------------------------------------------------ @@ -77,25 +100,37 @@ Scrapy crashes with: ImportError: No module named win32api You need to install `pywin32`_ because of `this Twisted bug`_. -.. _pywin32: http://sourceforge.net/projects/pywin32/ -.. _this Twisted bug: http://twistedmatrix.com/trac/ticket/3707 +.. _pywin32: https://sourceforge.net/projects/pywin32/ +.. _this Twisted bug: https://twistedmatrix.com/trac/ticket/3707 How can I simulate a user login in my spider? --------------------------------------------- See :ref:`topics-request-response-ref-request-userlogin`. +.. _faq-bfo-dfo: + Does Scrapy crawl in breadth-first or depth-first order? -------------------------------------------------------- By default, Scrapy uses a `LIFO`_ queue for storing pending requests, which basically means that it crawls in `DFO order`_. This order is more convenient -in most cases. If you do want to crawl in true `BFO order`_, you can do it by +in most cases. 
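A quick way to observe the actual crawl order is to log the depth that the
(enabled by default) depth middleware records for each response. The spider
below is only an illustrative sketch (the name and start URL are arbitrary),
not something shipped with Scrapy::

    import scrapy

    class OrderCheckSpider(scrapy.Spider):
        # Hypothetical spider used only to observe in which order pages
        # are visited.
        name = 'order_check'
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            # DepthMiddleware stores the link depth in the request/response meta.
            depth = response.meta.get('depth', 0)
            self.logger.info('Visited %s at depth %d', response.url, depth)
            for href in response.css('a::attr(href)').getall():
                yield response.follow(href, callback=self.parse)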
+ +If you do want to crawl in true `BFO order`_, you can do it by setting the following settings:: DEPTH_PRIORITY = 1 - SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue' - SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue' + SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' + SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' + +While pending requests are below the configured values of +:setting:`CONCURRENT_REQUESTS`, :setting:`CONCURRENT_REQUESTS_PER_DOMAIN` or +:setting:`CONCURRENT_REQUESTS_PER_IP`, those requests are sent +concurrently. As a result, the first few requests of a crawl rarely follow the +desired order. Lowering those settings to ``1`` enforces the desired order, but +it significantly slows down the crawl as a whole. + My Scrapy crawler has memory leaks. What can I do? -------------------------------------------------- @@ -113,7 +148,7 @@ See previous question. Can I use Basic HTTP Authentication in my spiders? -------------------------------------------------- -Yes, see :class:`~scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware`. +Yes, see :class:`~scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware`. Why does Scrapy download pages in English instead of my native language? ------------------------------------------------------------------------ @@ -121,7 +156,7 @@ Why does Scrapy download pages in English instead of my native language? Try changing the default `Accept-Language`_ request header by overriding the :setting:`DEFAULT_REQUEST_HEADERS` setting. -.. _Accept-Language: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4 +.. _Accept-Language: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4 Where can I find some example Scrapy projects? ---------------------------------------------- @@ -144,23 +179,23 @@ I get "Filtered offsite request" messages. How can I fix them? Those messages (logged with ``DEBUG`` level) don't necessarily mean there is a problem, so you may not need to fix them. -Those message are thrown by the Offsite Spider Middleware, which is a spider +Those messages are thrown by the Offsite Spider Middleware, which is a spider middleware (enabled by default) whose purpose is to filter out requests to domains outside the ones covered by the spider. For more info see: -:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`. +:class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware`. What is the recommended way to deploy a Scrapy crawler in production? --------------------------------------------------------------------- -See :ref:`topics-scrapyd`. +See :ref:`topics-deploy`. Can I use JSON for large exports? --------------------------------- It'll depend on how large your output is. See :ref:`this warning -` in :class:`~scrapy.contrib.exporter.JsonItemExporter` +` in :class:`~scrapy.exporters.JsonItemExporter` documentation. Can I return (Twisted) deferreds from signal handlers? @@ -190,7 +225,7 @@ Or by setting a global download delay in your project with the Can I call ``pdb.set_trace()`` from my spiders to debug them? ------------------------------------------------------------- -Yes, but you can also use the Scrapy shell which allows you too quickly analyze +Yes, but you can also use the Scrapy shell which allows you to quickly analyze (and even modify) the response being processed by your spider, which is, quite often, more useful than plain old ``pdb.set_trace()``. @@ -201,15 +236,15 @@ Simplest way to dump all my scraped items into a JSON/CSV/XML file? 
To dump into a JSON file:: - scrapy crawl myspider -o items.json + scrapy crawl myspider -O items.json To dump into a CSV file:: - scrapy crawl myspider -o items.csv + scrapy crawl myspider -O items.csv To dump into a XML file:: - scrapy crawl myspider -o items.xml + scrapy crawl myspider -O items.xml For more information see :ref:`topics-feed-exports` @@ -220,8 +255,8 @@ The ``__VIEWSTATE`` parameter is used in sites built with ASP.NET/VB.NET. For more info on how it works see `this page`_. Also, here's an `example spider`_ which scrapes one of these sites. -.. _this page: http://search.cpan.org/~ecarroll/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm -.. _example spider: http://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py +.. _this page: https://metacpan.org/pod/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm +.. _example spider: https://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py What's the best way to parse big XML/CSV data feeds? ---------------------------------------------------- @@ -280,38 +315,63 @@ I'm scraping a XML document and my XPath selector doesn't return any items You may need to remove namespaces. See :ref:`removing-namespaces`. +.. _faq-split-item: -I'm getting an error: "cannot import name crawler" +How to split an item into multiple items in an item pipeline? +------------------------------------------------------------- + +:ref:`Item pipelines ` cannot yield multiple items per +input item. :ref:`Create a spider middleware ` +instead, and use its +:meth:`~scrapy.spidermiddlewares.SpiderMiddleware.process_spider_output` +method for this purpose. For example:: + + from copy import deepcopy + + from itemadapter import is_item, ItemAdapter + + class MultiplyItemsMiddleware: + + def process_spider_output(self, response, result, spider): + for item in result: + if is_item(item): + adapter = ItemAdapter(item) + for _ in range(adapter['multiply_by']): + yield deepcopy(item) + +Does Scrapy support IPv6 addresses? +----------------------------------- + +Yes, by setting :setting:`DNS_RESOLVER` to ``scrapy.resolver.CachingHostnameResolver``. +Note that by doing so, you lose the ability to set a specific timeout for DNS requests +(the value of the :setting:`DNS_TIMEOUT` setting is ignored). + + +.. _faq-specific-reactor: + +How to deal with ``: filedescriptor out of range in select()`` exceptions? +---------------------------------------------------------------------------------------------- + +This issue `has been reported`_ to appear when running broad crawls in macOS, where the default +Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switching to a +different reactor is possible by using the :setting:`TWISTED_REACTOR` setting. + + +.. _faq-stop-response-download: + +How can I cancel the download of a given response? -------------------------------------------------- -This is caused by Scrapy changes due to the singletons removal. The error is -most likely raised by a module (extension, middleware, pipeline or spider) in -your Scrapy project that imports ``crawler`` from ``scrapy.project``. For -example:: +In some situations, it might be useful to stop the download of a certain response. +For instance, if you only need the first part of a large response and you would like +to save resources by avoiding the download of the whole body. 
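A rough sketch of the approach described in the next paragraph (a hypothetical
spider that keeps only the first received chunk of each response; the spider
name and URL are placeholders)::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class StopEarlySpider(scrapy.Spider):
        name = 'stop_early'
        start_urls = ['https://example.com/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # fail=False keeps the partial body and still calls the callback.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info('Received %d bytes from %s',
                             len(response.body), response.url)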
+In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received` +signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to +the :ref:`topics-stop-response-download` topic for additional information and examples. - from scrapy.project import crawler - class SomeExtension(object): - def __init__(self): - self.crawler = crawler - # ... - -This way to access the crawler object is deprecated, the code should be ported -to use ``from_crawler`` class method, for example:: - - class SomeExtension(object): - - @classmethod - def from_crawler(cls, crawler): - o = cls() - o.crawler = crawler - return o - -Scrapy command line tool has some backwards compatibility in place to support -the old import mechanism (with a deprecation warning), but this mechanism may -not work if you use Scrapy differently (for example, as a library). - -.. _user agents: http://en.wikipedia.org/wiki/User_agent -.. _LIFO: http://en.wikipedia.org/wiki/LIFO -.. _DFO order: http://en.wikipedia.org/wiki/Depth-first_search -.. _BFO order: http://en.wikipedia.org/wiki/Breadth-first_search +.. _has been reported: https://github.com/scrapy/scrapy/issues/2905 +.. _user agents: https://en.wikipedia.org/wiki/User_agent +.. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type) +.. _DFO order: https://en.wikipedia.org/wiki/Depth-first_search +.. _BFO order: https://en.wikipedia.org/wiki/Breadth-first_search diff --git a/docs/index.rst b/docs/index.rst index 2a1ae037b..11aa5c9be 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,7 +4,13 @@ Scrapy |version| documentation ============================== -This documentation contains everything you need to know about Scrapy. +Scrapy is a fast high-level `web crawling`_ and `web scraping`_ framework, used +to crawl websites and extract structured data from their pages. It can be used +for a wide range of purposes, from data mining to monitoring and automated +testing. + +.. _web crawling: https://en.wikipedia.org/wiki/Web_crawler +.. _web scraping: https://en.wikipedia.org/wiki/Web_scraping Getting help ============ @@ -13,13 +19,15 @@ Having trouble? We'd like to help! * Try the :doc:`FAQ ` -- it's got answers to some common questions. * Looking for specific information? Try the :ref:`genindex` or :ref:`modindex`. -* Search for information in the `archives of the scrapy-users mailing list`_, or - `post a question`_. -* Ask a question in the `#scrapy IRC channel`_. +* Ask or search questions in `StackOverflow using the scrapy tag`_. +* Ask or search questions in the `Scrapy subreddit`_. +* Search for questions on the archives of the `scrapy-users mailing list`_. +* Ask a question in the `#scrapy IRC channel`_, * Report bugs with Scrapy in our `issue tracker`_. -.. _archives of the scrapy-users mailing list: http://groups.google.com/group/scrapy-users/ -.. _post a question: http://groups.google.com/group/scrapy-users/ +.. _scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users +.. _Scrapy subreddit: https://www.reddit.com/r/scrapy/ +.. _StackOverflow using the scrapy tag: https://stackoverflow.com/tags/scrapy .. _#scrapy IRC channel: irc://irc.freenode.net/scrapy .. _issue tracker: https://github.com/scrapy/scrapy/issues @@ -28,6 +36,7 @@ First steps =========== .. toctree:: + :caption: First steps :hidden: intro/overview @@ -53,24 +62,26 @@ Basic concepts ============== .. 
toctree:: + :caption: Basic concepts :hidden: topics/commands - topics/items topics/spiders topics/selectors + topics/items topics/loaders topics/shell topics/item-pipeline topics/feed-exports + topics/request-response topics/link-extractors + topics/settings + topics/exceptions + :doc:`topics/commands` Learn about the command-line tool used to manage your Scrapy project. -:doc:`topics/items` - Define the data you want to scrape. - :doc:`topics/spiders` Write the rules to crawl your websites. @@ -80,6 +91,9 @@ Basic concepts :doc:`topics/shell` Test your extraction code in an interactive environment. +:doc:`topics/items` + Define the data you want to scrape. + :doc:`topics/loaders` Populate your items with the extracted data. @@ -89,13 +103,24 @@ Basic concepts :doc:`topics/feed-exports` Output your scraped data using different formats and storages. +:doc:`topics/request-response` + Understand the classes used to represent HTTP requests and responses. + :doc:`topics/link-extractors` Convenient classes to extract links to follow from pages. +:doc:`topics/settings` + Learn how to configure Scrapy and see all :ref:`available settings `. + +:doc:`topics/exceptions` + See all available exceptions and their meaning. + + Built-in services ================= .. toctree:: + :caption: Built-in services :hidden: topics/logging @@ -105,8 +130,8 @@ Built-in services topics/webservice :doc:`topics/logging` - Understand the simple logging facility provided by Scrapy. - + Learn how to use Python's builtin logging on Scrapy. + :doc:`topics/stats` Collect statistics about your scraping crawler. @@ -124,6 +149,7 @@ Solving specific problems ========================= .. toctree:: + :caption: Solving specific problems :hidden: faq @@ -131,22 +157,22 @@ Solving specific problems topics/contracts topics/practices topics/broad-crawls - topics/firefox - topics/firebug + topics/developer-tools + topics/dynamic-content topics/leaks - topics/images - topics/ubuntu - topics/scrapyd + topics/media-pipeline + topics/deploy topics/autothrottle topics/benchmarking topics/jobs - topics/djangoitem + topics/coroutines + topics/asyncio :doc:`faq` Get answers to most frequently asked questions. :doc:`topics/debug` - Learn how to debug common problems of your scrapy spider. + Learn how to debug common problems of your Scrapy spider. :doc:`topics/contracts` Learn how to use contracts for testing your spiders. @@ -157,23 +183,20 @@ Solving specific problems :doc:`topics/broad-crawls` Tune Scrapy for crawling a lot domains in parallel. -:doc:`topics/firefox` - Learn how to scrape with Firefox and some useful add-ons. +:doc:`topics/developer-tools` + Learn how to scrape with your browser's developer tools. -:doc:`topics/firebug` - Learn how to scrape efficiently using Firebug. +:doc:`topics/dynamic-content` + Read webpage data that is loaded dynamically. :doc:`topics/leaks` Learn how to find and get rid of memory leaks in your crawler. -:doc:`topics/images` - Download static images associated with your scraped items. +:doc:`topics/media-pipeline` + Download files and/or images associated with your scraped items. -:doc:`topics/ubuntu` - Install latest Scrapy packages easily on Ubuntu - -:doc:`topics/scrapyd` - Deploying your Scrapy project in production. +:doc:`topics/deploy` + Deploying your Scrapy spiders and run them in a remote server. :doc:`topics/autothrottle` Adjust crawl rate dynamically based on load. @@ -184,8 +207,11 @@ Solving specific problems :doc:`topics/jobs` Learn how to pause and resume crawls for large spiders. 
-:doc:`topics/djangoitem` - Write scraped items using Django models. +:doc:`topics/coroutines` + Use the :ref:`coroutine syntax `. + +:doc:`topics/asyncio` + Use :mod:`asyncio` and :mod:`asyncio`-powered libraries. .. _extending-scrapy: @@ -193,6 +219,7 @@ Extending Scrapy ================ .. toctree:: + :caption: Extending Scrapy :hidden: topics/architecture @@ -200,6 +227,9 @@ Extending Scrapy topics/spider-middleware topics/extensions topics/api + topics/signals + topics/exporters + :doc:`topics/architecture` Understand the Scrapy architecture. @@ -216,33 +246,9 @@ Extending Scrapy :doc:`topics/api` Use it on extensions and middlewares to extend Scrapy functionality -Reference -========= - -.. toctree:: - :hidden: - - topics/request-response - topics/settings - topics/signals - topics/exceptions - topics/exporters - -:doc:`topics/commands` - Learn about the command-line tool and see all :ref:`available commands `. - -:doc:`topics/request-response` - Understand the classes used to represent HTTP requests and responses. - -:doc:`topics/settings` - Learn how to configure Scrapy and see all :ref:`available settings `. - :doc:`topics/signals` See all available signals and how to work with them. -:doc:`topics/exceptions` - See all available exceptions and their meaning. - :doc:`topics/exporters` Quickly export your scraped items to a file (XML, CSV, etc). @@ -251,12 +257,12 @@ All the rest ============ .. toctree:: + :caption: All the rest :hidden: news contributing versioning - experimental/index :doc:`news` See what has changed in recent Scrapy versions. @@ -266,6 +272,3 @@ All the rest :doc:`versioning` Understand Scrapy versioning and API stability. - -:doc:`experimental/index` - Learn about bleeding-edge features. diff --git a/docs/intro/examples.rst b/docs/intro/examples.rst index 40a124679..96363c7d5 100644 --- a/docs/intro/examples.rst +++ b/docs/intro/examples.rst @@ -5,21 +5,16 @@ Examples ======== The best way to learn is with examples, and Scrapy is no exception. For this -reason, there is an example Scrapy project named dirbot_, that you can use to -play and learn more about Scrapy. It contains the dmoz spider described in the -tutorial. +reason, there is an example Scrapy project named quotesbot_, that you can use to +play and learn more about Scrapy. It contains two spiders for +http://quotes.toscrape.com, one using CSS selectors and another one using XPath +expressions. -This dirbot_ project is available at: https://github.com/scrapy/dirbot - -It contains a README file with a detailed description of the project contents. +The quotesbot_ project is available at: https://github.com/scrapy/quotesbot. +You can find more information about it in the project's README. If you're familiar with git, you can checkout the code. Otherwise you can -download a tarball or zip file of the project by clicking on `Downloads`_. +download the project as a zip file by clicking +`here `_. -The `scrapy tag on Snipplr`_ is used for sharing code snippets such as spiders, -middlewares, extensions, or scripts. Feel free (and encouraged!) to share any -code there. - -.. _dirbot: https://github.com/scrapy/dirbot -.. _Downloads: https://github.com/scrapy/dirbot/archives/master -.. _scrapy tag on Snipplr: http://snipplr.com/all/tags/scrapy/ +.. _quotesbot: https://github.com/scrapy/quotesbot diff --git a/docs/intro/install.rst b/docs/intro/install.rst index ffba0e2b3..6d65ae2ee 100644 --- a/docs/intro/install.rst +++ b/docs/intro/install.rst @@ -4,90 +4,271 @@ Installation guide ================== +.. 
_faq-python-versions: + +Supported Python versions +========================= + +Scrapy requires Python 3.5.2+, either the CPython implementation (default) or +the PyPy 5.9+ implementation (see :ref:`python:implementations`). + + Installing Scrapy ================= -.. note:: Check :ref:`intro-install-platform-notes` first. +If you're using `Anaconda`_ or `Miniconda`_, you can install the package from +the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows +and macOS. -The installation steps assume that you have the following things installed: +To install Scrapy using ``conda``, run:: -* `Python`_ 2.7 + conda install -c conda-forge scrapy -* `pip`_ and `setuptools`_ Python packages. Nowadays `pip`_ requires and - installs `setuptools`_ if not installed. +Alternatively, if you’re already familiar with installation of Python packages, +you can install Scrapy and its dependencies from PyPI with:: -* `lxml`_. Most Linux distributions ships prepackaged versions of lxml. - Otherwise refer to http://lxml.de/installation.html + pip install Scrapy -* `OpenSSL`_. This comes preinstalled in all operating systems, except Windows - where the Python installer ships it bundled. +Note that sometimes this may require solving compilation issues for some Scrapy +dependencies depending on your operating system, so be sure to check the +:ref:`intro-install-platform-notes`. -You can install Scrapy using pip (which is the canonical way to install Python -packages). +We strongly recommend that you install Scrapy in :ref:`a dedicated virtualenv `, +to avoid conflicting with your system packages. -To install using pip:: +For more detailed and platform specifics instructions, as well as +troubleshooting information, read on. + + +Things that are good to know +---------------------------- + +Scrapy is written in pure Python and depends on a few key Python packages (among others): + +* `lxml`_, an efficient XML and HTML parser +* `parsel`_, an HTML/XML data extraction library written on top of lxml, +* `w3lib`_, a multi-purpose helper for dealing with URLs and web page encodings +* `twisted`_, an asynchronous networking framework +* `cryptography`_ and `pyOpenSSL`_, to deal with various network-level security needs + +The minimal versions which Scrapy is tested against are: + +* Twisted 14.0 +* lxml 3.4 +* pyOpenSSL 0.14 + +Scrapy may work with older versions of these packages +but it is not guaranteed it will continue working +because it’s not being tested against them. + +Some of these packages themselves depends on non-Python packages +that might require additional installation steps depending on your platform. +Please check :ref:`platform-specific guides below `. + +In case of any trouble related to these dependencies, +please refer to their respective installation instructions: + +* `lxml installation`_ +* `cryptography installation`_ + +.. _lxml installation: https://lxml.de/installation.html +.. _cryptography installation: https://cryptography.io/en/latest/installation/ + + +.. _intro-using-virtualenv: + +Using a virtual environment (recommended) +----------------------------------------- + +TL;DR: We recommend installing Scrapy inside a virtual environment +on all platforms. + +Python packages can be installed either globally (a.k.a system wide), +or in user-space. We do not recommend installing Scrapy system wide. + +Instead, we recommend that you install Scrapy within a so-called +"virtual environment" (:mod:`venv`). 
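For instance, the following standard-library-only sketch creates such an
environment and installs Scrapy into it (equivalent to running ``python -m
venv`` and ``pip install`` from a shell; the directory name is arbitrary)::

    import subprocess
    import venv

    env_dir = 'scrapy-env'  # any directory you like
    venv.EnvBuilder(with_pip=True).create(env_dir)

    # Use the environment's own interpreter so Scrapy lands inside it
    # (on Windows the interpreter is under 'Scripts' instead of 'bin').
    subprocess.check_call([f'{env_dir}/bin/python', '-m', 'pip',
                           'install', 'Scrapy'])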
+Virtual environments allow you to not conflict with already-installed Python +system packages (which could break some of your system tools and scripts), +and still install packages normally with ``pip`` (without ``sudo`` and the likes). + +See :ref:`tut-venv` on how to create your virtual environment. + +Once you have created a virtual environment, you can install Scrapy inside it with ``pip``, +just like any other Python package. +(See :ref:`platform-specific guides ` +below for non-Python dependencies that you may need to install beforehand). - pip install Scrapy .. _intro-install-platform-notes: Platform specific installation notes ==================================== +.. _intro-install-windows: + Windows ------- -* Install Python 2.7 from http://python.org/download/ +Though it's possible to install Scrapy on Windows using pip, we recommend you +to install `Anaconda`_ or `Miniconda`_ and use the package from the +`conda-forge`_ channel, which will avoid most installation issues. - You need to adjust ``PATH`` environment variable to include paths to - the Python executable and additional scripts. The following paths need to be - added to ``PATH``:: +Once you've installed `Anaconda`_ or `Miniconda`_, install Scrapy with:: - C:\Python2.7\;C:\Python2.7\Scripts\; + conda install -c conda-forge scrapy - To update the ``PATH`` open a Command prompt and run:: - c:\python27\python.exe c:\python27\tools\scripts\win_add2path.py +.. _intro-install-ubuntu: - Close the command prompt window and reopen it so changes take effect, run the - following command and check it shows the expected Python version:: +Ubuntu 14.04 or above +--------------------- - python --version - -* Install `pip`_ from https://pip.pypa.io/en/latest/installing.html - - Now open a Command prompt to check ``pip`` is installed correctly:: - - pip --version - -* At this point Python 2.7 and ``pip`` package manager must be working, let's - install Scrapy:: - - pip install Scrapy - -Ubuntu 9.10 or above -~~~~~~~~~~~~~~~~~~~~ +Scrapy is currently tested with recent-enough versions of lxml, +twisted and pyOpenSSL, and is compatible with recent Ubuntu distributions. +But it should support older versions of Ubuntu too, like Ubuntu 14.04, +albeit with potential issues with TLS connections. **Don't** use the ``python-scrapy`` package provided by Ubuntu, they are typically too old and slow to catch up with latest Scrapy. -Instead, use the official :ref:`Ubuntu Packages `, which already -solve all dependencies for you and are continuously updated with the latest bug -fixes. -Archlinux -~~~~~~~~~ +To install Scrapy on Ubuntu (or Ubuntu-based) systems, you need to install +these dependencies:: -You can follow the generic instructions or install Scrapy from `AUR Scrapy package`:: + sudo apt-get install python3 python3-dev python3-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev - yaourt -S scrapy +- ``python3-dev``, ``zlib1g-dev``, ``libxml2-dev`` and ``libxslt1-dev`` + are required for ``lxml`` +- ``libssl-dev`` and ``libffi-dev`` are required for ``cryptography`` + +Inside a :ref:`virtualenv `, +you can install Scrapy with ``pip`` after that:: + + pip install scrapy + +.. note:: + The same non-Python dependencies can be used to install Scrapy in Debian + Jessie (8.0) and above. -.. _Python: http://www.python.org -.. _pip: http://www.pip-installer.org/en/latest/installing.html -.. _easy_install: http://pypi.python.org/pypi/setuptools -.. 
_Control Panel: http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx -.. _lxml: http://lxml.de/ -.. _OpenSSL: https://pypi.python.org/pypi/pyOpenSSL +.. _intro-install-macos: + +macOS +----- + +Building Scrapy's dependencies requires the presence of a C compiler and +development headers. On macOS this is typically provided by Apple’s Xcode +development tools. To install the Xcode command line tools open a terminal +window and run:: + + xcode-select --install + +There's a `known issue `_ that +prevents ``pip`` from updating system packages. This has to be addressed to +successfully install Scrapy and its dependencies. Here are some proposed +solutions: + +* *(Recommended)* **Don't** use system python, install a new, updated version + that doesn't conflict with the rest of your system. Here's how to do it using + the `homebrew`_ package manager: + + * Install `homebrew`_ following the instructions in https://brew.sh/ + + * Update your ``PATH`` variable to state that homebrew packages should be + used before system packages (Change ``.bashrc`` to ``.zshrc`` accordantly + if you're using `zsh`_ as default shell):: + + echo "export PATH=/usr/local/bin:/usr/local/sbin:$PATH" >> ~/.bashrc + + * Reload ``.bashrc`` to ensure the changes have taken place:: + + source ~/.bashrc + + * Install python:: + + brew install python + + * Latest versions of python have ``pip`` bundled with them so you won't need + to install it separately. If this is not the case, upgrade python:: + + brew update; brew upgrade python + +* *(Optional)* :ref:`Install Scrapy inside a Python virtual environment + `. + + This method is a workaround for the above macOS issue, but it's an overall + good practice for managing dependencies and can complement the first method. + +After any of these workarounds you should be able to install Scrapy:: + + pip install Scrapy + + +PyPy +---- + +We recommend using the latest PyPy version. The version tested is 5.9.0. +For PyPy3, only Linux installation was tested. + +Most Scrapy dependencides now have binary wheels for CPython, but not for PyPy. +This means that these dependecies will be built during installation. +On macOS, you are likely to face an issue with building Cryptography dependency, +solution to this problem is described +`here `_, +that is to ``brew install openssl`` and then export the flags that this command +recommends (only needed when installing Scrapy). Installing on Linux has no special +issues besides installing build dependencies. +Installing Scrapy with PyPy on Windows is not tested. + +You can check that Scrapy is installed correctly by running ``scrapy bench``. +If this command gives errors such as +``TypeError: ... got 2 unexpected keyword arguments``, this means +that setuptools was unable to pick up one PyPy-specific dependency. +To fix this issue, run ``pip install 'PyPyDispatcher>=2.1.0'``. + + +.. 
_intro-install-troubleshooting: + +Troubleshooting +=============== + +AttributeError: 'module' object has no attribute 'OP_NO_TLSv1_1' +---------------------------------------------------------------- + +After you install or upgrade Scrapy, Twisted or pyOpenSSL, you may get an +exception with the following traceback:: + + […] + File "[…]/site-packages/twisted/protocols/tls.py", line 63, in + from twisted.internet._sslverify import _setAcceptableProtocols + File "[…]/site-packages/twisted/internet/_sslverify.py", line 38, in + TLSVersion.TLSv1_1: SSL.OP_NO_TLSv1_1, + AttributeError: 'module' object has no attribute 'OP_NO_TLSv1_1' + +The reason you get this exception is that your system or virtual environment +has a version of pyOpenSSL that your version of Twisted does not support. + +To install a version of pyOpenSSL that your version of Twisted supports, +reinstall Twisted with the :code:`tls` extra option:: + + pip install twisted[tls] + +For details, see `Issue #2473 `_. + +.. _Python: https://www.python.org/ +.. _pip: https://pip.pypa.io/en/latest/installing/ +.. _lxml: https://lxml.de/index.html +.. _parsel: https://pypi.org/project/parsel/ +.. _w3lib: https://pypi.org/project/w3lib/ +.. _twisted: https://twistedmatrix.com/trac/ +.. _cryptography: https://cryptography.io/en/latest/ +.. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/ .. _setuptools: https://pypi.python.org/pypi/setuptools .. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/ +.. _homebrew: https://brew.sh/ +.. _zsh: https://www.zsh.org/ +.. _Scrapinghub: https://scrapinghub.com +.. _Anaconda: https://docs.anaconda.com/anaconda/ +.. _Miniconda: https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html +.. _conda-forge: https://conda-forge.org/ diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst index 289e975b8..dd80c7bd0 100644 --- a/docs/intro/overview.rst +++ b/docs/intro/overview.rst @@ -8,177 +8,90 @@ Scrapy is an application framework for crawling web sites and extracting structured data which can be used for a wide range of useful applications, like data mining, information processing or historical archival. -Even though Scrapy was originally designed for `screen scraping`_ (more -precisely, `web scraping`_), it can also be used to extract data using APIs -(such as `Amazon Associates Web Services`_) or as a general purpose web -crawler. +Even though Scrapy was originally designed for `web scraping`_, it can also be +used to extract data using APIs (such as `Amazon Associates Web Services`_) or +as a general purpose web crawler. -The purpose of this document is to introduce you to the concepts behind Scrapy -so you can get an idea of how it works and decide if Scrapy is what you need. -When you're ready to start a project, you can :ref:`start with the tutorial -`. +Walk-through of an example spider +================================= -Pick a website -============== +In order to show you what Scrapy brings to the table, we'll walk you through an +example of a Scrapy Spider using the simplest way to run a spider. -So you need to extract some information from a website, but the website doesn't -provide any API or mechanism to access that info programmatically. Scrapy can -help you extract that information. - -Let's say we want to extract the URL, name, description and size of all torrent -files added today in the `Mininova`_ site. - -The list of all torrents added today can be found on this page: - - http://www.mininova.org/today - -.. 
_intro-overview-item: - -Define the data you want to scrape -================================== - -The first thing is to define the data we want to scrape. In Scrapy, this is -done through :ref:`Scrapy Items ` (Torrent files, in this case). - -This would be our Item:: +Here's the code for a spider that scrapes famous quotes from website +http://quotes.toscrape.com, following the pagination:: import scrapy - class TorrentItem(scrapy.Item): - url = scrapy.Field() - name = scrapy.Field() - description = scrapy.Field() - size = scrapy.Field() -Write a Spider to extract the data -================================== + class QuotesSpider(scrapy.Spider): + name = 'quotes' + start_urls = [ + 'http://quotes.toscrape.com/tag/humor/', + ] -The next thing is to write a Spider which defines the start URL -(http://www.mininova.org/today), the rules for following links and the rules -for extracting the data from pages. + def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } -If we take a look at that page content we'll see that all torrent URLs are like -``http://www.mininova.org/tor/NUMBER`` where ``NUMBER`` is an integer. We'll use -that to construct the regular expression for the links to follow: ``/tor/\d+``. + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) -We'll use `XPath`_ for selecting the data to extract from the web page HTML -source. Let's take one of those torrent pages: +Put this in a text file, name it to something like ``quotes_spider.py`` +and run the spider using the :command:`runspider` command:: - http://www.mininova.org/tor/2676093 + scrapy runspider quotes_spider.py -o quotes.jl -And look at the page HTML source to construct the XPath to select the data we -want which is: torrent name, description and size. +When this finishes you will have in the ``quotes.jl`` file a list of the +quotes in JSON Lines format, containing text and author, looking like this:: -.. highlight:: html - -By looking at the page HTML source we can see that the file name is contained -inside a ``

<h1>`` tag::
-
-   <h1>Darwin - The Evolution Of An Exhibition</h1>
-
-.. highlight:: none
-
-An XPath expression to extract the name could be::
-
-    //h1/text()
-
-.. highlight:: html
-
-And the description is contained inside a ``<div>`` tag with ``id="description"``::
-
-   <h2>Description:</h2>
-
-   <div id="description">
-   Short documentary made for Plymouth City Museum and Art Gallery regarding the setup of an exhibit about Charles Darwin in conjunction with the 200th anniversary of his birth.
-
-   ...
-
-.. highlight:: none
-
-An XPath expression to select the description could be::
-
-    //div[@id='description']
-
-.. highlight:: html
-
-Finally, the file size is contained in the second ``<p>`` tag inside the ``<div>``
-tag with ``id=specifications``::
-
-   <div id="specifications">
-
-   <p>
-   Category:
-   Movies > Documentary
-   </p>
-
-   <p>
-   Total size:
-   150.62 megabyte</p>
+ {"author": "Jane Austen", "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"} + {"author": "Steve Martin", "text": "\u201cA day without sunshine is like, you know, night.\u201d"} + {"author": "Garrison Keillor", "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"} + ... -.. highlight:: none +What just happened? +------------------- -An XPath expression to select the file size could be:: +When you ran the command ``scrapy runspider quotes_spider.py``, Scrapy looked for a +Spider definition inside it and ran it through its crawler engine. - //div[@id='specifications']/p[2]/text()[2] +The crawl started by making requests to the URLs defined in the ``start_urls`` +attribute (in this case, only the URL for quotes in *humor* category) +and called the default callback method ``parse``, passing the response object as +an argument. In the ``parse`` callback, we loop through the quote elements +using a CSS Selector, yield a Python dict with the extracted quote text and author, +look for a link to the next page and schedule another request using the same +``parse`` method as callback. -.. highlight:: python +Here you notice one of the main advantages about Scrapy: requests are +:ref:`scheduled and processed asynchronously `. This +means that Scrapy doesn't need to wait for a request to be finished and +processed, it can send another request or do other things in the meantime. This +also means that other requests can keep going even if some request fails or an +error happens while handling it. -For more information about XPath see the `XPath reference`_. +While this enables you to do very fast crawls (sending multiple concurrent +requests at the same time, in a fault-tolerant way) Scrapy also gives you +control over the politeness of the crawl through :ref:`a few settings +`. You can do things like setting a download delay between +each request, limiting amount of concurrent requests per domain or per IP, and +even :ref:`using an auto-throttling extension ` that tries +to figure out these automatically. -Finally, here's the spider code:: +.. note:: - from scrapy.contrib.spiders import CrawlSpider, Rule - from scrapy.contrib.linkextractors import LinkExtractor + This is using :ref:`feed exports ` to generate the + JSON file, you can easily change the export format (XML or CSV, for example) or the + storage backend (FTP or `Amazon S3`_, for example). You can also write an + :ref:`item pipeline ` to store the items in a database. - class MininovaSpider(CrawlSpider): - - name = 'mininova' - allowed_domains = ['mininova.org'] - start_urls = ['http://www.mininova.org/today'] - rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')] - - def parse_torrent(self, response): - torrent = TorrentItem() - torrent['url'] = response.url - torrent['name'] = response.xpath("//h1/text()").extract() - torrent['description'] = response.xpath("//div[@id='description']").extract() - torrent['size'] = response.xpath("//div[@id='info-left']/p[2]/text()[2]").extract() - return torrent - -The ``TorrentItem`` class is :ref:`defined above `. - -Run the spider to extract the data -================================== - -Finally, we'll run the spider to crawl the site and output the file -``scraped_data.json`` with the scraped data in JSON format:: - - scrapy crawl mininova -o scraped_data.json - -This uses :ref:`feed exports ` to generate the JSON file. 
-You can easily change the export format (XML or CSV, for example) or the -storage backend (FTP or `Amazon S3`_, for example). - -You can also write an :ref:`item pipeline ` to store the -items in a database very easily. - -Review scraped data -=================== - -If you check the ``scraped_data.json`` file after the process finishes, you'll -see the scraped items there:: - - [{"url": "http://www.mininova.org/tor/2676093", "name": ["Darwin - The Evolution Of An Exhibition"], "description": ["Short documentary made for Plymouth ..."], "size": ["150.62 megabyte"]}, - # ... other items ... - ] - -You'll notice that all field values (except for the ``url`` which was assigned -directly) are actually lists. This is because the :ref:`selectors -` return lists. You may want to store single values, or -perform some additional parsing/cleansing to the values. That's what -:ref:`Item Loaders ` are for. .. _topics-whatelse: @@ -190,77 +103,53 @@ this is just the surface. Scrapy provides a lot of powerful features for making scraping easy and efficient, such as: * Built-in support for :ref:`selecting and extracting ` data - from HTML and XML sources + from HTML/XML sources using extended CSS selectors and XPath expressions, + with helper methods to extract using regular expressions. -* Built-in support for cleaning and sanitizing the scraped data using a - collection of reusable filters (called :ref:`Item Loaders `) - shared between all the spiders. +* An :ref:`interactive shell console ` (IPython aware) for trying + out the CSS and XPath expressions to scrape data, very useful when writing or + debugging your spiders. * Built-in support for :ref:`generating feed exports ` in multiple formats (JSON, CSV, XML) and storing them in multiple backends (FTP, S3, local filesystem) -* A media pipeline for :ref:`automatically downloading images ` - (or any other media) associated with the scraped items - -* Support for :ref:`extending Scrapy ` by plugging - your own functionality using :ref:`signals ` and a - well-defined API (middlewares, :ref:`extensions `, and - :ref:`pipelines `). - -* Wide range of built-in middlewares and extensions for: - - * cookies and session handling - * HTTP compression - * HTTP authentication - * HTTP cache - * user-agent spoofing - * robots.txt - * crawl depth restriction - * and more - * Robust encoding support and auto-detection, for dealing with foreign, non-standard and broken encoding declarations. -* Support for creating spiders based on pre-defined templates, to speed up - spider creation and make their code more consistent on large projects. See - :command:`genspider` command for more details. +* :ref:`Strong extensibility support `, allowing you to plug + in your own functionality using :ref:`signals ` and a + well-defined API (middlewares, :ref:`extensions `, and + :ref:`pipelines `). -* Extensible :ref:`stats collection ` for multiple spider - metrics, useful for monitoring the performance of your spiders and detecting - when they get broken +* Wide range of built-in extensions and middlewares for handling: -* An :ref:`Interactive shell console ` for trying XPaths, very - useful for writing and debugging your spiders - -* A :ref:`System service ` designed to ease the deployment and - run of your spiders in production. 
+ - cookies and session handling + - HTTP features like compression, authentication, caching + - user-agent spoofing + - robots.txt + - crawl depth restriction + - and more * A :ref:`Telnet console ` for hooking into a Python console running inside your Scrapy process, to introspect and debug your crawler -* :ref:`Logging ` facility that you can hook on to for catching - errors during the scraping process. - -* Support for crawling based on URLs discovered through `Sitemaps`_ - -* A caching DNS resolver +* Plus other goodies like reusable spiders to crawl sites from `Sitemaps`_ and + XML/CSV feeds, a media pipeline for :ref:`automatically downloading images + ` (or any other media) associated with the scraped + items, a caching DNS resolver, and much more! What's next? ============ -The next obvious steps are for you to `download Scrapy`_, read :ref:`the -tutorial ` and join `the community`_. Thanks for your +The next steps for you are to :ref:`install Scrapy `, +:ref:`follow through the tutorial ` to learn how to create +a full-blown Scrapy project and `join the community`_. Thanks for your interest! -.. _download Scrapy: http://scrapy.org/download/ -.. _the community: http://scrapy.org/community/ -.. _screen scraping: http://en.wikipedia.org/wiki/Screen_scraping -.. _web scraping: http://en.wikipedia.org/wiki/Web_scraping -.. _Amazon Associates Web Services: http://aws.amazon.com/associates/ -.. _Mininova: http://www.mininova.org -.. _XPath: http://www.w3.org/TR/xpath -.. _XPath reference: http://www.w3.org/TR/xpath -.. _Amazon S3: http://aws.amazon.com/s3/ -.. _Sitemaps: http://www.sitemaps.org +.. _join the community: https://scrapy.org/community/ +.. _web scraping: https://en.wikipedia.org/wiki/Web_scraping +.. _Amazon Associates Web Services: https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html +.. _Amazon S3: https://aws.amazon.com/s3/ +.. _Sitemaps: https://www.sitemaps.org/index.html diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index a4248d7aa..f96c78887 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -7,447 +7,752 @@ Scrapy Tutorial In this tutorial, we'll assume that Scrapy is already installed on your system. If that's not the case, see :ref:`intro-install`. -We are going to use `Open directory project (dmoz) `_ as -our example domain to scrape. +We are going to scrape `quotes.toscrape.com `_, a website +that lists quotes from famous authors. This tutorial will walk you through these tasks: 1. Creating a new Scrapy project -2. Defining the Items you will extract -3. Writing a :ref:`spider ` to crawl a site and extract - :ref:`Items ` -4. Writing an :ref:`Item Pipeline ` to store the - extracted Items +2. Writing a :ref:`spider ` to crawl a site and extract data +3. Exporting the scraped data using the command line +4. Changing spider to recursively follow links +5. Using spider arguments Scrapy is written in Python_. If you're new to the language you might want to start by getting an idea of what the language is like, to get the most out of -Scrapy. If you're already familiar with other languages, and want to learn -Python quickly, we recommend `Learn Python The Hard Way`_. If you're new to programming -and want to start with Python, take a look at `this list of Python resources -for non-programmers`_. +Scrapy. + +If you're already familiar with other languages, and want to learn Python quickly, the `Python Tutorial`_ is a good resource. 
+ +If you're new to programming and want to start with Python, the following books +may be useful to you: + +* `Automate the Boring Stuff With Python`_ + +* `How To Think Like a Computer Scientist`_ + +* `Learn Python 3 The Hard Way`_ + +You can also take a look at `this list of Python resources for non-programmers`_, +as well as the `suggested resources in the learnpython-subreddit`_. + +.. _Python: https://www.python.org/ +.. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers +.. _Python Tutorial: https://docs.python.org/3/tutorial +.. _Automate the Boring Stuff With Python: https://automatetheboringstuff.com/ +.. _How To Think Like a Computer Scientist: http://openbookproject.net/thinkcs/python/english3e/ +.. _Learn Python 3 The Hard Way: https://learnpythonthehardway.org/python3/ +.. _suggested resources in the learnpython-subreddit: https://www.reddit.com/r/learnpython/wiki/index#wiki_new_to_python.3F -.. _Python: http://www.python.org -.. _this list of Python resources for non-programmers: http://wiki.python.org/moin/BeginnersGuide/NonProgrammers -.. _Learn Python The Hard Way: http://learnpythonthehardway.org/book/ Creating a project ================== -Before you start scraping, you will have set up a new Scrapy project. Enter a -directory where you'd like to store your code and then run:: +Before you start scraping, you will have to set up a new Scrapy project. Enter a +directory where you'd like to store your code and run:: scrapy startproject tutorial This will create a ``tutorial`` directory with the following contents:: tutorial/ - scrapy.cfg - tutorial/ + scrapy.cfg # deploy configuration file + + tutorial/ # project's Python module, you'll import your code from here __init__.py - items.py - pipelines.py - settings.py - spiders/ + + items.py # project items definition file + + middlewares.py # project middlewares file + + pipelines.py # project pipelines file + + settings.py # project settings file + + spiders/ # a directory where you'll later put your spiders __init__.py - ... -These are basically: - -* ``scrapy.cfg``: the project configuration file -* ``tutorial/``: the project's python module, you'll later import your code from - here. -* ``tutorial/items.py``: the project's items file. -* ``tutorial/pipelines.py``: the project's pipelines file. -* ``tutorial/settings.py``: the project's settings file. -* ``tutorial/spiders/``: a directory where you'll later put your spiders. - -Defining our Item -================= - -`Items` are containers that will be loaded with the scraped data; they work -like simple python dicts but provide additional protection against populating -undeclared fields, to prevent typos. - -They are declared by creating a :class:`scrapy.Item ` class and defining -its attributes as :class:`scrapy.Field ` objects, like you will in an ORM -(don't worry if you're not familiar with ORMs, you will see that this is an -easy task). - -We begin by modeling the item that we will use to hold the sites data obtained -from dmoz.org, as we want to capture the name, url and description of the -sites, we define fields for each of these three attributes. To do that, we edit -``items.py``, found in the ``tutorial`` directory. 
Our Item class looks like this:: - - import scrapy - - class DmozItem(scrapy.Item): - title = scrapy.Field() - link = scrapy.Field() - desc = scrapy.Field() - -This may seem complicated at first, but defining the item allows you to use other handy -components of Scrapy that need to know how your item looks. Our first Spider ================ -Spiders are user-written classes used to scrape information from a domain (or group -of domains). +Spiders are classes that you define and that Scrapy uses to scrape information +from a website (or a group of websites). They must subclass +:class:`~scrapy.spiders.Spider` and define the initial requests to make, +optionally how to follow links in the pages, and how to parse the downloaded +page content to extract data. -They define an initial list of URLs to download, how to follow links, and how -to parse the contents of those pages to extract :ref:`items `. - -To create a Spider, you must subclass :class:`scrapy.Spider ` and -define the three main mandatory attributes: - -* :attr:`~scrapy.spider.Spider.name`: identifies the Spider. It must be - unique, that is, you can't set the same name for different Spiders. - -* :attr:`~scrapy.spider.Spider.start_urls`: is a list of URLs where the - Spider will begin to crawl from. So, the first pages downloaded will be those - listed here. The subsequent URLs will be generated successively from data - contained in the start URLs. - -* :meth:`~scrapy.spider.Spider.parse` is a method of the spider, which will - be called with the downloaded :class:`~scrapy.http.Response` object of each - start URL. The response is passed to the method as the first and only - argument. - - This method is responsible for parsing the response data and extracting - scraped data (as scraped items) and more URLs to follow. - - The :meth:`~scrapy.spider.Spider.parse` method is in charge of processing - the response and returning scraped data (as :class:`~scrapy.item.Item` - objects) and more URLs to follow (as :class:`~scrapy.http.Request` objects). - -This is the code for our first Spider; save it in a file named -``dmoz_spider.py`` under the ``tutorial/spiders`` directory:: +This is the code for our first Spider. Save it in a file named +``quotes_spider.py`` under the ``tutorial/spiders`` directory in your project:: import scrapy - class DmozSpider(scrapy.Spider): - name = "dmoz" - allowed_domains = ["dmoz.org"] - start_urls = [ - "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", - "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" - ] + + class QuotesSpider(scrapy.Spider): + name = "quotes" + + def start_requests(self): + urls = [ + 'http://quotes.toscrape.com/page/1/', + 'http://quotes.toscrape.com/page/2/', + ] + for url in urls: + yield scrapy.Request(url=url, callback=self.parse) def parse(self, response): - filename = response.url.split("/")[-2] + page = response.url.split("/")[-2] + filename = 'quotes-%s.html' % page with open(filename, 'wb') as f: f.write(response.body) + self.log('Saved file %s' % filename) -Crawling --------- + +As you can see, our Spider subclasses :class:`scrapy.Spider ` +and defines some attributes and methods: + +* :attr:`~scrapy.spiders.Spider.name`: identifies the Spider. It must be + unique within a project, that is, you can't set the same name for different + Spiders. 
+
+* :meth:`~scrapy.spiders.Spider.start_requests`: must return an iterable of
+  Requests (you can return a list of requests or write a generator function)
+  which the Spider will begin to crawl from. Subsequent requests will be
+  generated successively from these initial requests.
+
+* :meth:`~scrapy.spiders.Spider.parse`: a method that will be called to handle
+  the response downloaded for each of the requests made. The response parameter
+  is an instance of :class:`~scrapy.http.TextResponse` that holds
+  the page content and has further helpful methods to handle it.
+
+  The :meth:`~scrapy.spiders.Spider.parse` method usually parses the response, extracting
+  the scraped data as dicts and also finding new URLs to
+  follow and creating new requests (:class:`~scrapy.http.Request`) from them.
+
+How to run our spider
+---------------------

 To put our spider to work, go to the project's top level directory and run::

-    scrapy crawl dmoz
+    scrapy crawl quotes

-The ``crawl dmoz`` command runs the spider for the ``dmoz.org`` domain. You
-will get an output similar to this::
+This command runs the spider with name ``quotes`` that we've just added, that
+will send some requests for the ``quotes.toscrape.com`` domain. You will get an output
+similar to this::

-    2014-01-23 18:13:07-0400 [scrapy] INFO: Scrapy started (bot: tutorial)
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Optional features available: ...
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Overridden settings: {}
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled extensions: ...
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled downloader middlewares: ...
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled spider middlewares: ...
-    2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled item pipelines: ...
-    2014-01-23 18:13:07-0400 [dmoz] INFO: Spider opened
-    2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) (referer: None)
-    2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) (referer: None)
-    2014-01-23 18:13:09-0400 [dmoz] INFO: Closing spider (finished)
+    ... (omitted for brevity)
+    2016-12-16 21:24:05 [scrapy.core.engine] INFO: Spider opened
+    2016-12-16 21:24:05 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
+    2016-12-16 21:24:05 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
+    2016-12-16 21:24:05 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://quotes.toscrape.com/robots.txt> (referer: None)
+    2016-12-16 21:24:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/page/1/> (referer: None)
+    2016-12-16 21:24:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/page/2/> (referer: None)
+    2016-12-16 21:24:05 [quotes] DEBUG: Saved file quotes-1.html
+    2016-12-16 21:24:05 [quotes] DEBUG: Saved file quotes-2.html
+    2016-12-16 21:24:05 [scrapy.core.engine] INFO: Closing spider (finished)
+    ...

-Pay attention to the lines containing ``[dmoz]``, which corresponds to our
-spider. You can see a log line for each URL defined in ``start_urls``. Because
-these URLs are the starting ones, they have no referrers, which is shown at the
-end of the log line, where it says ``(referer: None)``.
+Now, check the files in the current directory. You should notice that two new
+files have been created: *quotes-1.html* and *quotes-2.html*, with the content
+for the respective URLs, as our ``parse`` method instructs.
+
+.. note:: If you are wondering why we haven't parsed the HTML yet, hold
+   on, we will cover that soon.
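For readers who prefer to drive a crawl from a plain Python script rather than the command line, Scrapy also provides :class:`~scrapy.crawler.CrawlerProcess`. The following is only an illustrative sketch, not part of the diff above: it re-uses the spider defined earlier, and the ``LOG_LEVEL`` setting is an arbitrary choice::

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        def start_requests(self):
            urls = [
                'http://quotes.toscrape.com/page/1/',
                'http://quotes.toscrape.com/page/2/',
            ]
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            page = response.url.split("/")[-2]
            filename = 'quotes-%s.html' % page
            with open(filename, 'wb') as f:
                f.write(response.body)
            self.log('Saved file %s' % filename)


    # CrawlerProcess starts a Twisted reactor, runs the crawl and blocks
    # until it finishes; the LOG_LEVEL value is just an illustrative setting.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(QuotesSpider)
    process.start()

Saved as, say, ``run_quotes.py`` (a hypothetical filename) and executed with ``python run_quotes.py``, this should produce the same ``quotes-1.html`` and ``quotes-2.html`` files as the command-line run.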
-But more interesting, as our ``parse`` method instructs, two files have been -created: *Books* and *Resources*, with the content of both URLs. What just happened under the hood? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Scrapy creates :class:`scrapy.Request ` objects -for each URL in the ``start_urls`` attribute of the Spider, and assigns -them the ``parse`` method of the spider as their callback function. - -These Requests are scheduled, then executed, and :class:`scrapy.http.Response` -objects are returned and then fed back to the spider, through the -:meth:`~scrapy.spider.Spider.parse` method. - -Extracting Items ----------------- - -Introduction to Selectors -^^^^^^^^^^^^^^^^^^^^^^^^^ - -There are several ways to extract data from web pages. Scrapy uses a mechanism -based on `XPath`_ or `CSS`_ expressions called :ref:`Scrapy Selectors -`. For more information about selectors and other extraction -mechanisms see the :ref:`Selectors documentation `. - -.. _XPath: http://www.w3.org/TR/xpath -.. _CSS: http://www.w3.org/TR/selectors - -Here are some examples of XPath expressions and their meanings: - -* ``/html/head/title``: selects the ```` element, inside the ``<head>`` - element of a HTML document - -* ``/html/head/title/text()``: selects the text inside the aforementioned - ``<title>`` element. - -* ``//td``: selects all the ``<td>`` elements - -* ``//div[@class="mine"]``: selects all ``div`` elements which contain an - attribute ``class="mine"`` - -These are just a couple of simple examples of what you can do with XPath, but -XPath expressions are indeed much more powerful. To learn more about XPath we -recommend `this XPath tutorial <http://www.w3schools.com/XPath/default.asp>`_. - -For working with XPaths, Scrapy provides :class:`~scrapy.selector.Selector` -class and convenient shortcuts to avoid instantiating selectors yourself -everytime you need to select something from a response. - -You can see selectors as objects that represent nodes in the document -structure. So, the first instantiated selectors are associated with the root -node, or the entire document. - -Selectors have four basic methods (click on the method to see the complete API -documentation): - -* :meth:`~scrapy.selector.Selector.xpath`: returns a list of selectors, each of - them representing the nodes selected by the xpath expression given as - argument. - -* :meth:`~scrapy.selector.Selector.css`: returns a list of selectors, each of - them representing the nodes selected by the CSS expression given as argument. - -* :meth:`~scrapy.selector.Selector.extract`: returns a unicode string with the - selected data. - -* :meth:`~scrapy.selector.Selector.re`: returns a list of unicode strings - extracted by applying the regular expression given as argument. +Scrapy schedules the :class:`scrapy.Request <scrapy.http.Request>` objects +returned by the ``start_requests`` method of the Spider. Upon receiving a +response for each one, it instantiates :class:`~scrapy.http.Response` objects +and calls the callback method associated with the request (in this case, the +``parse`` method) passing the response as argument. -Trying Selectors in the Shell -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A shortcut to the start_requests method +--------------------------------------- +Instead of implementing a :meth:`~scrapy.spiders.Spider.start_requests` method +that generates :class:`scrapy.Request <scrapy.http.Request>` objects from URLs, +you can just define a :attr:`~scrapy.spiders.Spider.start_urls` class attribute +with a list of URLs. 
This list will then be used by the default implementation +of :meth:`~scrapy.spiders.Spider.start_requests` to create the initial requests +for your spider:: -To illustrate the use of Selectors we're going to use the built-in :ref:`Scrapy -shell <topics-shell>`, which also requires IPython (an extended Python console) -installed on your system. + import scrapy -To start a shell, you must go to the project's top level directory and run:: - scrapy shell "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/" + class QuotesSpider(scrapy.Spider): + name = "quotes" + start_urls = [ + 'http://quotes.toscrape.com/page/1/', + 'http://quotes.toscrape.com/page/2/', + ] + + def parse(self, response): + page = response.url.split("/")[-2] + filename = 'quotes-%s.html' % page + with open(filename, 'wb') as f: + f.write(response.body) + +The :meth:`~scrapy.spiders.Spider.parse` method will be called to handle each +of the requests for those URLs, even though we haven't explicitly told Scrapy +to do so. This happens because :meth:`~scrapy.spiders.Spider.parse` is Scrapy's +default callback method, which is called for requests without an explicitly +assigned callback. + + +Extracting data +--------------- + +The best way to learn how to extract data with Scrapy is trying selectors +using the :ref:`Scrapy shell <topics-shell>`. Run:: + + scrapy shell 'http://quotes.toscrape.com/page/1/' .. note:: - Remember to always enclose urls with quotes when running Scrapy shell from - command-line, otherwise urls containing arguments (ie. ``&`` character) + Remember to always enclose urls in quotes when running Scrapy shell from + command-line, otherwise urls containing arguments (i.e. ``&`` character) will not work. -This is what the shell looks like:: + On Windows, use double quotes instead:: + + scrapy shell "http://quotes.toscrape.com/page/1/" + +You will see something like:: [ ... Scrapy log here ... ] - - 2014-01-23 17:11:42-0400 [default] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None) + 2016-09-19 12:09:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/page/1/> (referer: None) [s] Available Scrapy objects: - [s] crawler <scrapy.crawler.Crawler object at 0x3636b50> + [s] scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc) + [s] crawler <scrapy.crawler.Crawler object at 0x7fa91d888c90> [s] item {} - [s] request <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> - [s] response <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> - [s] settings <scrapy.settings.Settings object at 0x3fadc50> - [s] spider <Spider 'default' at 0x3cebf50> + [s] request <GET http://quotes.toscrape.com/page/1/> + [s] response <200 http://quotes.toscrape.com/page/1/> + [s] settings <scrapy.settings.Settings object at 0x7fa91d888c10> + [s] spider <DefaultSpider 'default' at 0x7fa91c8af990> [s] Useful shortcuts: [s] shelp() Shell help (print this help) [s] fetch(req_or_url) Fetch request (or URL) and update local objects [s] view(response) View response in a browser - In [1]: +Using the shell, you can try selecting elements using `CSS`_ with the response +object: -After the shell loads, you will have the response fetched in a local -``response`` variable, so if you type ``response.body`` you will see the body -of the response, or you can type ``response.headers`` to see its headers. +.. 
invisible-code-block: python -More important, if you type ``response.selector`` you will access a selector -object you can use to query the response, and convenient shortcuts like -``response.xpath()`` and ``response.css()`` mapping to -``response.selector.xpath()`` and ``response.selector.css()`` + response = load_response('http://quotes.toscrape.com/page/1/', 'quotes1.html') + +>>> response.css('title') +[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape'>] + +The result of running ``response.css('title')`` is a list-like object called +:class:`~scrapy.selector.SelectorList`, which represents a list of +:class:`~scrapy.selector.Selector` objects that wrap around XML/HTML elements +and allow you to run further queries to fine-grain the selection or extract the +data. + +To extract the text from the title above, you can do: + +>>> response.css('title::text').getall() +['Quotes to Scrape'] + +There are two things to note here: one is that we've added ``::text`` to the +CSS query, to mean we want to select only the text elements directly inside +```` element. If we don't specify ``::text``, we'd get the full title +element, including its tags: + +>>> response.css('title').getall() +['<title>Quotes to Scrape'] + +The other thing is that the result of calling ``.getall()`` is a list: it is +possible that a selector returns more than one result, so we extract them all. +When you know you just want the first result, as in this case, you can do: + +>>> response.css('title::text').get() +'Quotes to Scrape' + +As an alternative, you could've written: + +>>> response.css('title::text')[0].get() +'Quotes to Scrape' + +However, using ``.get()`` directly on a :class:`~scrapy.selector.SelectorList` +instance avoids an ``IndexError`` and returns ``None`` when it doesn't +find any element matching the selection. + +There's a lesson here: for most scraping code, you want it to be resilient to +errors due to things not being found on a page, so that even if some parts fail +to be scraped, you can at least get **some** data. + +Besides the :meth:`~scrapy.selector.SelectorList.getall` and +:meth:`~scrapy.selector.SelectorList.get` methods, you can also use +the :meth:`~scrapy.selector.SelectorList.re` method to extract using +:doc:`regular expressions `: + +>>> response.css('title::text').re(r'Quotes.*') +['Quotes to Scrape'] +>>> response.css('title::text').re(r'Q\w+') +['Quotes'] +>>> response.css('title::text').re(r'(\w+) to (\w+)') +['Quotes', 'Scrape'] + +In order to find the proper CSS selectors to use, you might find useful opening +the response page from the shell in your web browser using ``view(response)``. +You can use your browser's developer tools to inspect the HTML and come up +with a selector (see :ref:`topics-developer-tools`). + +`Selector Gadget`_ is also a nice tool to quickly find CSS selector for +visually selected elements, which works in many browsers. + +.. 
_Selector Gadget: https://selectorgadget.com/ -So let's try it:: +XPath: a brief intro +^^^^^^^^^^^^^^^^^^^^ - In [1]: response.xpath('//title') - Out[1]: [Open Directory - Computers: Progr'>] - - In [2]: response.xpath('//title').extract() - Out[2]: [u'Open Directory - Computers: Programming: Languages: Python: Books'] - - In [3]: response.xpath('//title/text()') - Out[3]: [] - - In [4]: response.xpath('//title/text()').extract() - Out[4]: [u'Open Directory - Computers: Programming: Languages: Python: Books'] - - In [5]: response.xpath('//title/text()').re('(\w+):') - Out[5]: [u'Computers', u'Programming', u'Languages', u'Python'] +Besides `CSS`_, Scrapy selectors also support using `XPath`_ expressions: -Extracting the data -^^^^^^^^^^^^^^^^^^^ +>>> response.xpath('//title') +[] +>>> response.xpath('//title/text()').get() +'Quotes to Scrape' -Now, let's try to extract some real information from those pages. +XPath expressions are very powerful, and are the foundation of Scrapy +Selectors. In fact, CSS selectors are converted to XPath under-the-hood. You +can see that if you read closely the text representation of the selector +objects in the shell. -You could type ``response.body`` in the console, and inspect the source code to -figure out the XPaths you need to use. However, inspecting the raw HTML code -there could become a very tedious task. To make this an easier task, you can -use some Firefox extensions like Firebug. For more information see -:ref:`topics-firebug` and :ref:`topics-firefox`. +While perhaps not as popular as CSS selectors, XPath expressions offer more +power because besides navigating the structure, it can also look at the +content. Using XPath, you're able to select things like: *select the link +that contains the text "Next Page"*. This makes XPath very fitting to the task +of scraping, and we encourage you to learn XPath even if you already know how to +construct CSS selectors, it will make scraping much easier. -After inspecting the page source, you'll find that the web sites information -is inside a ``
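To tie the CSS and XPath notes above together, here is a brief, illustrative shell sketch. It assumes the markup currently served by quotes.toscrape.com (quotes in ``div.quote`` elements with ``span.text`` and ``small.author`` children, and a pagination link inside ``li.next``); if the page layout changes, the selectors will need adjusting::

    # A hedged sketch, typed into the same "scrapy shell" session as above.
    # CSS: grab the text and author of every quote on the page.
    quote_texts = response.css('div.quote span.text::text').getall()
    authors = response.css('div.quote small.author::text').getall()

    # XPath: the use case mentioned above -- select the pagination link by
    # the text it contains, then take its href attribute.
    next_href = response.xpath('//li[@class="next"]/a/@href').get()
    # or, matching on the link text itself:
    next_href = response.xpath('//a[contains(text(), "Next")]/@href').get()

    # get() returns None when nothing matches, so this stays resilient on
    # the last page, where there is no "Next" link.

The same pagination href can also be taken with CSS via ``response.css('li.next a::attr(href)').get()``; XPath earns its keep once you need to match on the text of an element, as in the second expression above.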