Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 03:03:47 +00:00)

Merge pull request #1181 from Curita/module-relocation: Module relocation

This commit is contained in commit 5155162e69
@ -150,9 +150,9 @@ To run all tests go to the root directory of Scrapy source code and run:

``tox``

To run a specific test (say ``tests/test_contrib_loader.py``) use:
To run a specific test (say ``tests/test_loader.py``) use:

``tox -- tests/test_contrib_loader.py``
``tox -- tests/test_loader.py``
Writing tests

@ -166,11 +166,11 @@ Scrapy uses unit-tests, which are located in the `tests/`_ directory.

Their module name typically resembles the full path of the module they're
testing. For example, the item loaders code is in::

scrapy.contrib.loader
scrapy.loader

And their unit-tests are in::

tests/test_contrib_loader.py
tests/test_loader.py

.. _issue tracker: https://github.com/scrapy/scrapy/issues
.. _scrapy-users: https://groups.google.com/forum/#!forum/scrapy-users
docs/faq.rst
@ -64,7 +64,7 @@ Does Scrapy work with HTTP proxies?

Yes. Support for HTTP proxies is provided (since Scrapy 0.8) through the HTTP
Proxy downloader middleware. See
:class:`~scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware`.
:class:`~scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware`.
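As a side note (a minimal sketch, not part of this commit; the proxy address is a placeholder), the relocated middleware is driven per request through the ``proxy`` key in ``request.meta``::

    import scrapy

    class ProxiedSpider(scrapy.Spider):
        name = 'proxied'

        def start_requests(self):
            # HttpProxyMiddleware routes this request through the given proxy
            yield scrapy.Request('http://example.com',
                                 meta={'proxy': 'http://localhost:8050'})

        def parse(self, response):
            pass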
How can I scrape an item with attributes in different pages?
------------------------------------------------------------

@ -94,8 +94,8 @@ in most cases. If you do want to crawl in true `BFO order`_, you can do it by
setting the following settings::

DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

My Scrapy crawler has memory leaks. What can I do?
--------------------------------------------------

@ -113,7 +113,7 @@ See previous question.

Can I use Basic HTTP Authentication in my spiders?
--------------------------------------------------

Yes, see :class:`~scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware`.
Yes, see :class:`~scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware`.
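A minimal sketch (mirroring the intranet example that appears later in this commit; host and credentials are placeholders)::

    from scrapy.spiders import CrawlSpider

    class SomeIntranetSiteSpider(CrawlSpider):
        name = 'intranet.example.com'
        # Read by scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware
        http_user = 'someuser'
        http_pass = 'somepass'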
Why does Scrapy download pages in English instead of my native language?
------------------------------------------------------------------------

@ -149,7 +149,7 @@ middleware (enabled by default) whose purpose is to filter out requests to
domains outside the ones covered by the spider.

For more info see:
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`.
:class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware`.

What is the recommended way to deploy a Scrapy crawler in production?
---------------------------------------------------------------------

@ -160,7 +160,7 @@ Can I use JSON for large exports?
---------------------------------

It'll depend on how large your output is. See :ref:`this warning
<json-with-large-data>` in :class:`~scrapy.contrib.exporter.JsonItemExporter`
<json-with-large-data>` in :class:`~scrapy.exporters.JsonItemExporter`
documentation.

Can I return (Twisted) deferreds from signal handlers?

@ -511,7 +511,7 @@ using a `trick to pass additional data to the callbacks

.. note::
As an example spider that leverages this mechanism, check out the
:class:`~scrapy.contrib.spiders.CrawlSpider` class for a generic spider
:class:`~scrapy.spiders.CrawlSpider` class for a generic spider
that implements a small rules engine that you can use to write your
crawlers on top of it.
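A minimal sketch of such a rule-based spider, using the module paths introduced by this commit (URLs and patterns are placeholders)::

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        # Follow category pages, send item pages to parse_item()
        rules = (
            Rule(LinkExtractor(allow=r'/category/'), follow=True),
            Rule(LinkExtractor(allow=r'/item/'), callback='parse_item'),
        )

        def parse_item(self, response):
            # extract and return item fields here
            pass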
@ -65,7 +65,7 @@ how you :ref:`configure the downloader middlewares

For an introduction on stats collection see :ref:`topics-stats`.

For the API see :class:`~scrapy.statscol.StatsCollector` class.
For the API see :class:`~scrapy.statscollectors.StatsCollector` class.
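As a brief illustration (a sketch, not part of this commit), an extension reaches the same API through ``crawler.stats``::

    from scrapy import signals

    class ItemCounter(object):

        def __init__(self, stats):
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler.stats)
            crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
            return ext

        def item_scraped(self, item, spider):
            # StatsCollector API: inc_value / set_value / get_value
            self.stats.inc_value('custom/items_seen')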
.. attribute:: extensions

@ -452,11 +452,11 @@ Stats Collector API
===================

There are several Stats Collectors available under the
:mod:`scrapy.statscol` module and they all implement the Stats
Collector API defined by the :class:`~scrapy.statscol.StatsCollector`
:mod:`scrapy.statscollectors` module and they all implement the Stats
Collector API defined by the :class:`~scrapy.statscollectors.StatsCollector`
class (which they all inherit from).

.. module:: scrapy.statscol
.. module:: scrapy.statscollectors
:synopsis: Stats Collectors

.. class:: StatsCollector

@ -399,7 +399,7 @@ Supported options:

* ``--pipelines``: process items through pipelines

* ``--rules`` or ``-r``: use :class:`~scrapy.contrib.spiders.CrawlSpider`
* ``--rules`` or ``-r``: use :class:`~scrapy.spiders.CrawlSpider`
rules to discover the callback (i.e. spider method) to use for parsing the
response
@ -42,7 +42,7 @@ as its value. For example, if you want to disable the user-agent middleware::
|
||||
|
||||
DOWNLOADER_MIDDLEWARES = {
|
||||
'myproject.middlewares.CustomDownloaderMiddleware': 543,
|
||||
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
|
||||
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
|
||||
}
|
||||
|
||||
Finally, keep in mind that some middlewares may need to be enabled through a
|
||||
@ -54,7 +54,7 @@ Writing your own downloader middleware
|
||||
Each middleware component is a Python class that defines one or
|
||||
more of the following methods:
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware
|
||||
.. module:: scrapy.downloadermiddlewares
|
||||
|
||||
.. class:: DownloaderMiddleware
|
||||
|
||||
@ -169,7 +169,7 @@ For a list of the components enabled by default (and their orders) see the
|
||||
CookiesMiddleware
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.cookies
|
||||
.. module:: scrapy.downloadermiddlewares.cookies
|
||||
:synopsis: Cookies Downloader Middleware
|
||||
|
||||
.. class:: CookiesMiddleware
|
||||
@ -246,7 +246,7 @@ Here's an example of a log with :setting:`COOKIES_DEBUG` enabled::
|
||||
DefaultHeadersMiddleware
|
||||
------------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.defaultheaders
|
||||
.. module:: scrapy.downloadermiddlewares.defaultheaders
|
||||
:synopsis: Default Headers Downloader Middleware
|
||||
|
||||
.. class:: DefaultHeadersMiddleware
|
||||
@ -257,7 +257,7 @@ DefaultHeadersMiddleware
|
||||
DownloadTimeoutMiddleware
|
||||
-------------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.downloadtimeout
|
||||
.. module:: scrapy.downloadermiddlewares.downloadtimeout
|
||||
:synopsis: Download timeout middleware
|
||||
|
||||
.. class:: DownloadTimeoutMiddleware
|
||||
@ -275,7 +275,7 @@ DownloadTimeoutMiddleware
|
||||
HttpAuthMiddleware
|
||||
------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.httpauth
|
||||
.. module:: scrapy.downloadermiddlewares.httpauth
|
||||
:synopsis: HTTP Auth downloader middleware
|
||||
|
||||
.. class:: HttpAuthMiddleware
|
||||
@ -288,7 +288,7 @@ HttpAuthMiddleware
|
||||
|
||||
Example::
|
||||
|
||||
from scrapy.contrib.spiders import CrawlSpider
|
||||
from scrapy.spiders import CrawlSpider
|
||||
|
||||
class SomeIntranetSiteSpider(CrawlSpider):
|
||||
|
||||
@ -304,7 +304,7 @@ HttpAuthMiddleware
|
||||
HttpCacheMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.httpcache
|
||||
.. module:: scrapy.downloadermiddlewares.httpcache
|
||||
:synopsis: HTTP Cache downloader middleware
|
||||
|
||||
.. class:: HttpCacheMiddleware
|
||||
@ -349,7 +349,7 @@ when an Internet connection is not available. The goal is to be able to
|
||||
|
||||
In order to use this policy, set:
|
||||
|
||||
* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.DummyPolicy``
|
||||
* :setting:`HTTPCACHE_POLICY` to ``scrapy.extensions.httpcache.DummyPolicy``
|
||||
|
||||
|
||||
.. _httpcache-policy-rfc2616:
|
||||
@ -383,7 +383,7 @@ what is missing:
|
||||
|
||||
In order to use this policy, set:
|
||||
|
||||
* :setting:`HTTPCACHE_POLICY` to ``scrapy.contrib.httpcache.RFC2616Policy``
|
||||
* :setting:`HTTPCACHE_POLICY` to ``scrapy.extensions.httpcache.RFC2616Policy``
|
||||
|
||||
|
||||
.. _httpcache-storage-fs:
|
||||
@ -395,7 +395,7 @@ File system storage backend is available for the HTTP cache middleware.
|
||||
|
||||
In order to use this storage backend, set:
|
||||
|
||||
* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.FilesystemCacheStorage``
|
||||
* :setting:`HTTPCACHE_STORAGE` to ``scrapy.extensions.httpcache.FilesystemCacheStorage``
|
||||
|
||||
Each request/response pair is stored in a different directory containing
|
||||
the following files:
|
||||
@ -430,7 +430,7 @@ By default, it uses the anydbm_ module, but you can change it with the
|
||||
|
||||
In order to use this storage backend, set:
|
||||
|
||||
* :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.DbmCacheStorage``
|
||||
* :setting:`HTTPCACHE_STORAGE` to ``scrapy.extensions.httpcache.DbmCacheStorage``
|
||||
|
||||
.. _httpcache-storage-leveldb:
|
||||
|
||||
@ -447,7 +447,7 @@ the scrapy shell in parallel for the same spider.
|
||||
|
||||
In order to use this storage backend:
|
||||
|
||||
* set :setting:`HTTPCACHE_STORAGE` to ``scrapy.contrib.httpcache.LeveldbCacheStorage``
|
||||
* set :setting:`HTTPCACHE_STORAGE` to ``scrapy.extensions.httpcache.LeveldbCacheStorage``
|
||||
* install `LevelDB python bindings`_ like ``pip install leveldb``
|
||||
|
||||
.. _LevelDB: http://code.google.com/p/leveldb/
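Putting the renamed classes together, a project might enable the cache like this (a sketch; values are illustrative)::

    # settings.py
    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 0   # 0 means cached responses never expire
    HTTPCACHE_DIR = 'httpcache'
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.LeveldbCacheStorage'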
|
||||
@ -536,7 +536,7 @@ Don't cache responses with these URI schemes.
|
||||
HTTPCACHE_STORAGE
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``'scrapy.contrib.httpcache.FilesystemCacheStorage'``
|
||||
Default: ``'scrapy.extensions.httpcache.FilesystemCacheStorage'``
|
||||
|
||||
The class which implements the cache storage backend.
|
||||
|
||||
@ -559,7 +559,7 @@ HTTPCACHE_POLICY
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Default: ``'scrapy.contrib.httpcache.DummyPolicy'``
|
||||
Default: ``'scrapy.extensions.httpcache.DummyPolicy'``
|
||||
|
||||
The class which implements the cache policy.
|
||||
|
||||
@ -579,7 +579,7 @@ This setting is specific to the Filesystem backend.
|
||||
HttpCompressionMiddleware
|
||||
-------------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.httpcompression
|
||||
.. module:: scrapy.downloadermiddlewares.httpcompression
|
||||
:synopsis: Http Compression Middleware
|
||||
|
||||
.. class:: HttpCompressionMiddleware
|
||||
@ -603,7 +603,7 @@ Whether the Compression middleware will be enabled.
|
||||
ChunkedTransferMiddleware
|
||||
-------------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.chunked
|
||||
.. module:: scrapy.downloadermiddlewares.chunked
|
||||
:synopsis: Chunked Transfer Middleware
|
||||
|
||||
.. class:: ChunkedTransferMiddleware
|
||||
@ -613,7 +613,7 @@ ChunkedTransferMiddleware
|
||||
HttpProxyMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.httpproxy
|
||||
.. module:: scrapy.downloadermiddlewares.httpproxy
|
||||
:synopsis: Http Proxy Middleware
|
||||
|
||||
.. versionadded:: 0.8
|
||||
@ -641,7 +641,7 @@ HttpProxyMiddleware
|
||||
RedirectMiddleware
|
||||
------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.redirect
|
||||
.. module:: scrapy.downloadermiddlewares.redirect
|
||||
:synopsis: Redirection Middleware
|
||||
|
||||
.. class:: RedirectMiddleware
|
||||
@ -731,7 +731,7 @@ The maximum meta-refresh delay (in seconds) to follow the redirection.
|
||||
RetryMiddleware
|
||||
---------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.retry
|
||||
.. module:: scrapy.downloadermiddlewares.retry
|
||||
:synopsis: Retry Middleware
|
||||
|
||||
.. class:: RetryMiddleware
|
||||
@ -800,7 +800,7 @@ connections lost, etc) are always retried.
|
||||
RobotsTxtMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.robotstxt
|
||||
.. module:: scrapy.downloadermiddlewares.robotstxt
|
||||
:synopsis: robots.txt middleware
|
||||
|
||||
.. class:: RobotsTxtMiddleware
|
||||
@ -828,7 +828,7 @@ the request will be ignored by this middleware even if
|
||||
DownloaderStats
|
||||
---------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.stats
|
||||
.. module:: scrapy.downloadermiddlewares.stats
|
||||
:synopsis: Downloader Stats Middleware
|
||||
|
||||
.. class:: DownloaderStats
|
||||
@ -842,7 +842,7 @@ DownloaderStats
|
||||
UserAgentMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.useragent
|
||||
.. module:: scrapy.downloadermiddlewares.useragent
|
||||
:synopsis: User Agent Middleware
|
||||
|
||||
.. class:: UserAgentMiddleware
|
||||
@ -857,7 +857,7 @@ UserAgentMiddleware
|
||||
AjaxCrawlMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.ajaxcrawl
|
||||
.. module:: scrapy.downloadermiddlewares.ajaxcrawl
|
||||
|
||||
.. class:: AjaxCrawlMiddleware
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
Item Exporters
|
||||
==============
|
||||
|
||||
.. module:: scrapy.contrib.exporter
|
||||
.. module:: scrapy.exporters
|
||||
:synopsis: Item Exporters
|
||||
|
||||
Once you have scraped your items, you often want to persist or export those
|
||||
@ -40,7 +40,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses an Item
|
||||
Exporter to export scraped items to different files, one per spider::
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.contrib.exporter import XmlItemExporter
|
||||
from scrapy.exporters import XmlItemExporter
|
||||
|
||||
class XmlExportPipeline(object):
|
||||
|
||||
@ -117,7 +117,7 @@ after your custom code.
|
||||
|
||||
Example::
|
||||
|
||||
from scrapy.contrib.exporter import XmlItemExporter
|
||||
from scrapy.exporters import XmlItemExporter
|
||||
|
||||
class ProductXmlExporter(XmlItemExporter):
|
||||
|
||||
|
@ -35,7 +35,7 @@ your Scrapy settings. In :setting:`EXTENSIONS`, each extension is represented
|
||||
by a string: the full Python path to the extension's class name. For example::
|
||||
|
||||
EXTENSIONS = {
|
||||
'scrapy.contrib.corestats.CoreStats': 500,
|
||||
'scrapy.extensions.corestats.CoreStats': 500,
|
||||
'scrapy.telnet.TelnetConsole': 500,
|
||||
}
|
||||
|
||||
@ -69,7 +69,7 @@ included in the :setting:`EXTENSIONS_BASE` setting) you must set its order to
|
||||
``None``. For example::
|
||||
|
||||
EXTENSIONS = {
|
||||
'scrapy.contrib.corestats.CoreStats': None,
|
||||
'scrapy.extensions.corestats.CoreStats': None,
|
||||
}
|
||||
|
||||
Writing your own extension
|
||||
@ -158,7 +158,7 @@ General purpose extensions
|
||||
Log Stats extension
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.logstats
|
||||
.. module:: scrapy.extensions.logstats
|
||||
:synopsis: Basic stats logging
|
||||
|
||||
.. class:: LogStats
|
||||
@ -168,7 +168,7 @@ Log basic stats like crawled pages and scraped items.
|
||||
Core Stats extension
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.corestats
|
||||
.. module:: scrapy.extensions.corestats
|
||||
:synopsis: Core stats collection
|
||||
|
||||
.. class:: CoreStats
|
||||
@ -198,10 +198,10 @@ setting, and the server will listen in the port specified in
|
||||
Memory usage extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.memusage
|
||||
.. module:: scrapy.extensions.memusage
|
||||
:synopsis: Memory usage extension
|
||||
|
||||
.. class:: scrapy.contrib.memusage.MemoryUsage
|
||||
.. class:: scrapy.extensions.memusage.MemoryUsage
|
||||
|
||||
.. note:: This extension does not work in Windows.
|
||||
|
||||
@ -226,10 +226,10 @@ can be configured with the following settings:
|
||||
Memory debugger extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.memdebug
|
||||
.. module:: scrapy.extensions.memdebug
|
||||
:synopsis: Memory debugger extension
|
||||
|
||||
.. class:: scrapy.contrib.memdebug.MemoryDebugger
|
||||
.. class:: scrapy.extensions.memdebug.MemoryDebugger
|
||||
|
||||
An extension for debugging memory usage. It collects information about:
|
||||
|
||||
@ -242,10 +242,10 @@ info will be stored in the stats.
|
||||
Close spider extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.closespider
|
||||
.. module:: scrapy.extensions.closespider
|
||||
:synopsis: Close spider extension
|
||||
|
||||
.. class:: scrapy.contrib.closespider.CloseSpider
|
||||
.. class:: scrapy.extensions.closespider.CloseSpider
|
||||
|
||||
Closes a spider automatically when some conditions are met, using a specific
|
||||
closing reason for each condition.
|
||||
@ -313,17 +313,17 @@ set), spiders won't be closed by number of errors.
|
||||
StatsMailer extension
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.statsmailer
|
||||
.. module:: scrapy.extensions.statsmailer
|
||||
:synopsis: StatsMailer extension
|
||||
|
||||
.. class:: scrapy.contrib.statsmailer.StatsMailer
|
||||
.. class:: scrapy.extensions.statsmailer.StatsMailer
|
||||
|
||||
This simple extension can be used to send a notification e-mail every time a
|
||||
domain has finished scraping, including the Scrapy stats collected. The email
|
||||
will be sent to all recipients specified in the :setting:`STATSMAILER_RCPTS`
|
||||
setting.
|
||||
|
||||
.. module:: scrapy.contrib.debug
|
||||
.. module:: scrapy.extensions.debug
|
||||
:synopsis: Extensions for debugging Scrapy
|
||||
|
||||
Debugging extensions
|
||||
@ -332,7 +332,7 @@ Debugging extensions
|
||||
Stack trace dump extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. class:: scrapy.contrib.debug.StackTraceDump
|
||||
.. class:: scrapy.extensions.debug.StackTraceDump
|
||||
|
||||
Dumps information about the running process when a `SIGQUIT`_ or `SIGUSR2`_
|
||||
signal is received. The information dumped is the following:
|
||||
@ -361,7 +361,7 @@ There are at least two ways to send Scrapy the `SIGQUIT`_ signal:
|
||||
Debugger extension
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. class:: scrapy.contrib.debug.Debugger
|
||||
.. class:: scrapy.extensions.debug.Debugger
|
||||
|
||||
Invokes a `Python debugger`_ inside a running Scrapy process when a `SIGUSR2`_
|
||||
signal is received. After the debugger is exited, the Scrapy process continues
|
||||
|
@ -37,7 +37,7 @@ JSON
|
||||
----
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``json``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.JsonItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.JsonItemExporter`
|
||||
* See :ref:`this warning <json-with-large-data>` if you're using JSON with
|
||||
large feeds.
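For instance (a sketch; the output path is a placeholder), a project can pick the format and destination through the feed settings::

    # settings.py
    FEED_FORMAT = 'jsonlines'         # one JSON object per line, safer for large exports
    FEED_URI = 'file:///tmp/items.jl'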
|
||||
|
||||
@ -47,7 +47,7 @@ JSON lines
|
||||
----------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``jsonlines``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.JsonLinesItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.JsonLinesItemExporter`
|
||||
|
||||
.. _topics-feed-format-csv:
|
||||
|
||||
@ -55,7 +55,7 @@ CSV
|
||||
---
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``csv``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.CsvItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.CsvItemExporter`
|
||||
* To specify columns to export and their order use
|
||||
:setting:`FEED_EXPORT_FIELDS`. Other feed exporters can also use this
|
||||
option, but it is important for CSV because unlike many other export
|
||||
@ -67,7 +67,7 @@ XML
|
||||
---
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``xml``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.XmlItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.XmlItemExporter`
|
||||
|
||||
.. _topics-feed-format-pickle:
|
||||
|
||||
@ -75,7 +75,7 @@ Pickle
|
||||
------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``pickle``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.PickleItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.PickleItemExporter`
|
||||
|
||||
.. _topics-feed-format-marshal:
|
||||
|
||||
@ -83,7 +83,7 @@ Marshal
|
||||
-------
|
||||
|
||||
* :setting:`FEED_FORMAT`: ``marshal``
|
||||
* Exporter used: :class:`~scrapy.contrib.exporter.MarshalItemExporter`
|
||||
* Exporter used: :class:`~scrapy.exporters.MarshalItemExporter`
|
||||
|
||||
|
||||
.. _topics-feed-storage:
|
||||
@ -209,7 +209,7 @@ These are the settings used for configuring the feed exports:
|
||||
* :setting:`FEED_STORE_EMPTY`
|
||||
* :setting:`FEED_EXPORT_FIELDS`
|
||||
|
||||
.. currentmodule:: scrapy.contrib.feedexport
|
||||
.. currentmodule:: scrapy.extensions.feedexport
|
||||
|
||||
.. setting:: FEED_URI
|
||||
|
||||
@ -272,11 +272,11 @@ FEED_STORAGES_BASE
|
||||
Default::
|
||||
|
||||
{
|
||||
'': 'scrapy.contrib.feedexport.FileFeedStorage',
|
||||
'file': 'scrapy.contrib.feedexport.FileFeedStorage',
|
||||
'stdout': 'scrapy.contrib.feedexport.StdoutFeedStorage',
|
||||
's3': 'scrapy.contrib.feedexport.S3FeedStorage',
|
||||
'ftp': 'scrapy.contrib.feedexport.FTPFeedStorage',
|
||||
'': 'scrapy.extensions.feedexport.FileFeedStorage',
|
||||
'file': 'scrapy.extensions.feedexport.FileFeedStorage',
|
||||
'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
|
||||
's3': 'scrapy.extensions.feedexport.S3FeedStorage',
|
||||
'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
|
||||
}
|
||||
|
||||
A dict containing the built-in feed storage backends supported by Scrapy.
|
||||
@ -300,11 +300,11 @@ FEED_EXPORTERS_BASE
|
||||
Default::
|
||||
|
||||
FEED_EXPORTERS_BASE = {
|
||||
'json': 'scrapy.contrib.exporter.JsonItemExporter',
|
||||
'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
|
||||
'csv': 'scrapy.contrib.exporter.CsvItemExporter',
|
||||
'xml': 'scrapy.contrib.exporter.XmlItemExporter',
|
||||
'marshal': 'scrapy.contrib.exporter.MarshalItemExporter',
|
||||
'json': 'scrapy.exporters.JsonItemExporter',
|
||||
'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
|
||||
'csv': 'scrapy.exporters.CsvItemExporter',
|
||||
'xml': 'scrapy.exporters.XmlItemExporter',
|
||||
'marshal': 'scrapy.exporters.MarshalItemExporter',
|
||||
}
|
||||
|
||||
A dict containing the built-in feed exporters supported by Scrapy.
|
||||
|
@ -74,15 +74,15 @@ So, based on that regular expression we can create the first crawling rule::
|
||||
follow=True,
|
||||
),
|
||||
|
||||
The :class:`~scrapy.contrib.spiders.Rule` object instructs
|
||||
:class:`~scrapy.contrib.spiders.CrawlSpider` based spiders how to follow the
|
||||
The :class:`~scrapy.spiders.Rule` object instructs
|
||||
:class:`~scrapy.spiders.CrawlSpider` based spiders how to follow the
|
||||
category links. ``parse_category`` will be a method of the spider which will
|
||||
process and extract data from those pages.
|
||||
|
||||
This is how the spider would look so far::
|
||||
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
class GoogleDirectorySpider(CrawlSpider):
|
||||
name = 'directory.google.com'
|
||||
|
@ -8,7 +8,7 @@ Link extractors are objects whose only purpose is to extract links from web
|
||||
pages (:class:`scrapy.http.Response` objects) which will be eventually
|
||||
followed.
|
||||
|
||||
There is ``scrapy.contrib.linkextractors import LinkExtractor`` available
|
||||
There is ``scrapy.linkextractors import LinkExtractor`` available
|
||||
in Scrapy, but you can create your own custom Link Extractors to suit your
|
||||
needs by implementing a simple interface.
|
||||
|
||||
@ -18,10 +18,10 @@ of :class:`scrapy.link.Link` objects. Link extractors are meant to be
|
||||
instantiated once and their ``extract_links`` method called several times
|
||||
with different responses to extract links to follow.
|
||||
|
||||
Link extractors are used in the :class:`~scrapy.contrib.spiders.CrawlSpider`
|
||||
Link extractors are used in the :class:`~scrapy.spiders.CrawlSpider`
|
||||
class (available in Scrapy), through a set of rules, but you can also use it in
|
||||
your spiders, even if you don't subclass from
|
||||
:class:`~scrapy.contrib.spiders.CrawlSpider`, as its purpose is very simple: to
|
||||
:class:`~scrapy.spiders.CrawlSpider`, as its purpose is very simple: to
|
||||
extract links.
|
||||
|
||||
|
||||
@ -30,16 +30,16 @@ extract links.
|
||||
Built-in link extractors reference
|
||||
==================================
|
||||
|
||||
.. module:: scrapy.contrib.linkextractors
|
||||
.. module:: scrapy.linkextractors
|
||||
:synopsis: Link extractors classes
|
||||
|
||||
Link extractors classes bundled with Scrapy are provided in the
|
||||
:mod:`scrapy.contrib.linkextractors` module.
|
||||
:mod:`scrapy.linkextractors` module.
|
||||
|
||||
The default link extractor is ``LinkExtractor``, which is the same as
|
||||
:class:`~.LxmlLinkExtractor`::
|
||||
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from scrapy.linkextractors import LinkExtractor
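A short usage sketch, assuming a ``response`` object is already at hand (the pattern is a placeholder)::

    from scrapy.linkextractors import LinkExtractor

    link_extractor = LinkExtractor(allow=r'/category/')
    # extract_links() returns scrapy.link.Link objects
    links = link_extractor.extract_links(response)
    urls = [link.url for link in links]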
|
||||
|
||||
There used to be other link extractor classes in previous Scrapy versions,
|
||||
but they are deprecated now.
|
||||
@ -47,7 +47,7 @@ but they are deprecated now.
|
||||
LxmlLinkExtractor
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.linkextractors.lxmlhtml
|
||||
.. module:: scrapy.linkextractors.lxmlhtml
|
||||
:synopsis: lxml's HTMLParser-based link extractors
|
||||
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
Item Loaders
|
||||
============
|
||||
|
||||
.. module:: scrapy.contrib.loader
|
||||
.. module:: scrapy.loader
|
||||
:synopsis: Item Loader class
|
||||
|
||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`Items
|
||||
@ -39,7 +39,7 @@ Here is a typical Item Loader usage in a :ref:`Spider <topics-spiders>`, using
|
||||
the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
|
||||
chapter <topics-items>`::
|
||||
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from scrapy.loader import ItemLoader
|
||||
from myproject.items import Product
|
||||
|
||||
def parse(self, response):
|
||||
@ -150,8 +150,8 @@ Declaring Item Loaders
|
||||
Item Loaders are declared like Items, by using a class definition syntax. Here
|
||||
is an example::
|
||||
|
||||
from scrapy.contrib.loader import ItemLoader
|
||||
from scrapy.contrib.loader.processor import TakeFirst, MapCompose, Join
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.loader.processors import TakeFirst, MapCompose, Join
|
||||
|
||||
class ProductLoader(ItemLoader):
|
||||
|
||||
@ -182,7 +182,7 @@ output processors to use: in the :ref:`Item Field <topics-items-fields>`
|
||||
metadata. Here is an example::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
|
||||
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||
from w3lib.html import remove_tags
|
||||
|
||||
def filter_price(value):
|
||||
@ -201,7 +201,7 @@ metadata. Here is an example::
|
||||
|
||||
::
|
||||
|
||||
>>> from scrapy.contrib.loader import ItemLoader
|
||||
>>> from scrapy.loader import ItemLoader
|
||||
>>> il = ItemLoader(item=Product())
|
||||
>>> il.add_value('name', [u'Welcome to my', u'<strong>website</strong>'])
|
||||
>>> il.add_value('price', [u'€', u'<span>1000</span>'])
|
||||
@ -309,7 +309,7 @@ ItemLoader objects
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import TakeFirst
|
||||
>>> from scrapy.loader.processors import TakeFirst
|
||||
>>> loader.get_value(u'name: foo', TakeFirst(), unicode.upper, re='name: (.+)')
|
||||
'FOO'
|
||||
|
||||
@ -513,7 +513,7 @@ those dashes in the final product names.
|
||||
Here's how you can remove those dashes by reusing and extending the default
|
||||
Product Item Loader (``ProductLoader``)::
|
||||
|
||||
from scrapy.contrib.loader.processor import MapCompose
|
||||
from scrapy.loader.processors import MapCompose
|
||||
from myproject.ItemLoaders import ProductLoader
|
||||
|
||||
def strip_dashes(x):
|
||||
@ -526,7 +526,7 @@ Another case where extending Item Loaders can be very helpful is when you have
|
||||
multiple source formats, for example XML and HTML. In the XML version you may
|
||||
want to remove ``CDATA`` occurrences. Here's an example of how to do it::
|
||||
|
||||
from scrapy.contrib.loader.processor import MapCompose
|
||||
from scrapy.loader.processors import MapCompose
|
||||
from myproject.ItemLoaders import ProductLoader
|
||||
from myproject.utils.xml import remove_cdata
|
||||
|
||||
@ -551,7 +551,7 @@ needs.
|
||||
Available built-in processors
|
||||
=============================
|
||||
|
||||
.. module:: scrapy.contrib.loader.processor
|
||||
.. module:: scrapy.loader.processors
|
||||
:synopsis: A collection of processors to use with Item Loaders
|
||||
|
||||
Even though you can use any callable function as input and output processors,
|
||||
@ -570,7 +570,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Identity
|
||||
>>> from scrapy.loader.processors import Identity
|
||||
>>> proc = Identity()
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
['one', 'two', 'three']
|
||||
@ -583,7 +583,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import TakeFirst
|
||||
>>> from scrapy.loader.processors import TakeFirst
|
||||
>>> proc = TakeFirst()
|
||||
>>> proc(['', 'one', 'two', 'three'])
|
||||
'one'
|
||||
@ -598,7 +598,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Join
|
||||
>>> from scrapy.loader.processors import Join
|
||||
>>> proc = Join()
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
u'one two three'
|
||||
@ -619,7 +619,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import Compose
|
||||
>>> from scrapy.loader.processors import Compose
|
||||
>>> proc = Compose(lambda v: v[0], str.upper)
|
||||
>>> proc(['hello', 'world'])
|
||||
'HELLO'
|
||||
@ -666,7 +666,7 @@ Here is a list of all built-in processors:
|
||||
>>> def filter_world(x):
|
||||
... return None if x == 'world' else x
|
||||
...
|
||||
>>> from scrapy.contrib.loader.processor import MapCompose
|
||||
>>> from scrapy.loader.processors import MapCompose
|
||||
>>> proc = MapCompose(filter_world, unicode.upper)
|
||||
>>> proc([u'hello', u'world', u'this', u'is', u'scrapy'])
|
||||
[u'HELLO', u'THIS', u'IS', u'SCRAPY']
|
||||
@ -683,7 +683,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Example::
|
||||
|
||||
>>> from scrapy.contrib.loader.processor import SelectJmes, Compose, MapCompose
|
||||
>>> from scrapy.loader.processors import SelectJmes, Compose, MapCompose
|
||||
>>> proc = SelectJmes("foo") #for direct use on lists and dictionaries
|
||||
>>> proc({'foo': 'bar'})
|
||||
'bar'
|
||||
|
@ -4,7 +4,7 @@
|
||||
Downloading and processing files and images
|
||||
===========================================
|
||||
|
||||
.. currentmodule:: scrapy.contrib.pipeline.images
|
||||
.. currentmodule:: scrapy.pipelines.images
|
||||
|
||||
Scrapy provides reusable :doc:`item pipelines </topics/item-pipeline>` for
|
||||
downloading files attached to a particular item (for example, when you scrape
|
||||
@ -114,11 +114,11 @@ To enable your media pipeline you must first add it to your project
|
||||
|
||||
For Images Pipeline, use::
|
||||
|
||||
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
|
||||
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
|
||||
|
||||
For Files Pipeline, use::
|
||||
|
||||
ITEM_PIPELINES = {'scrapy.contrib.pipeline.files.FilesPipeline': 1}
|
||||
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
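With either pipeline enabled, items declare where the downloads come from and receive the results (a sketch; the storage path is a placeholder)::

    # settings.py
    FILES_STORE = '/path/to/valid/dir'

    # items.py
    import scrapy

    class MyItem(scrapy.Item):
        file_urls = scrapy.Field()   # set by the spider
        files = scrapy.Field()       # filled in by FilesPipeline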
|
||||
|
||||
|
||||
.. note::
|
||||
@ -258,7 +258,7 @@ By default, there are no size constraints, so all images are processed.
|
||||
Extending the Media Pipelines
|
||||
=============================
|
||||
|
||||
.. module:: scrapy.contrib.pipeline.files
|
||||
.. module:: scrapy.pipelines.files
|
||||
:synopsis: Files Pipeline
|
||||
|
||||
See here the methods that you can override in your custom Files Pipeline:
|
||||
@ -338,7 +338,7 @@ See here the methods that you can override in your custom Files Pipeline:
|
||||
By default, the :meth:`item_completed` method returns the item.
|
||||
|
||||
|
||||
.. module:: scrapy.contrib.pipeline.images
|
||||
.. module:: scrapy.pipelines.images
|
||||
:synopsis: Images Pipeline
|
||||
|
||||
See here the methods that you can override in your custom Images Pipeline:
|
||||
@ -374,7 +374,7 @@ Here is a full example of the Images Pipeline whose methods are exemplified
|
||||
above::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.pipeline.images import ImagesPipeline
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
@ -239,7 +239,7 @@ Default::
|
||||
}
|
||||
|
||||
The default headers used for Scrapy HTTP Requests. They're populated in the
|
||||
:class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
|
||||
:class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`.
|
||||
|
||||
.. setting:: DEPTH_LIMIT
|
||||
|
||||
@ -335,20 +335,20 @@ DOWNLOADER_MIDDLEWARES_BASE
|
||||
Default::
|
||||
|
||||
{
|
||||
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
|
||||
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
|
||||
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
|
||||
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
|
||||
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
|
||||
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
|
||||
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
|
||||
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
|
||||
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
|
||||
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
|
||||
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
|
||||
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
|
||||
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
|
||||
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
|
||||
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
|
||||
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
|
||||
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
|
||||
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
|
||||
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
|
||||
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
|
||||
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
|
||||
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
|
||||
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
|
||||
'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
|
||||
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
|
||||
}
|
||||
|
||||
A dict containing the downloader middlewares enabled by default in Scrapy. You
|
||||
@ -487,7 +487,7 @@ If you want to disable it set to 0.
|
||||
DUPEFILTER_CLASS
|
||||
----------------
|
||||
|
||||
Default: ``'scrapy.dupefilter.RFPDupeFilter'``
|
||||
Default: ``'scrapy.dupefilters.RFPDupeFilter'``
|
||||
|
||||
The class used to detect and filter duplicate requests.
|
||||
|
||||
@ -536,15 +536,15 @@ EXTENSIONS_BASE
|
||||
Default::
|
||||
|
||||
{
|
||||
'scrapy.contrib.corestats.CoreStats': 0,
|
||||
'scrapy.extensions.corestats.CoreStats': 0,
|
||||
'scrapy.telnet.TelnetConsole': 0,
|
||||
'scrapy.contrib.memusage.MemoryUsage': 0,
|
||||
'scrapy.contrib.memdebug.MemoryDebugger': 0,
|
||||
'scrapy.contrib.closespider.CloseSpider': 0,
|
||||
'scrapy.contrib.feedexport.FeedExporter': 0,
|
||||
'scrapy.contrib.logstats.LogStats': 0,
|
||||
'scrapy.contrib.spiderstate.SpiderState': 0,
|
||||
'scrapy.contrib.throttle.AutoThrottle': 0,
|
||||
'scrapy.extensions.memusage.MemoryUsage': 0,
|
||||
'scrapy.extensions.memdebug.MemoryDebugger': 0,
|
||||
'scrapy.extensions.closespider.CloseSpider': 0,
|
||||
'scrapy.extensions.feedexport.FeedExporter': 0,
|
||||
'scrapy.extensions.logstats.LogStats': 0,
|
||||
'scrapy.extensions.spiderstate.SpiderState': 0,
|
||||
'scrapy.extensions.throttle.AutoThrottle': 0,
|
||||
}
|
||||
|
||||
The list of available extensions. Keep in mind that some of them need to
|
||||
@ -689,7 +689,7 @@ MEMUSAGE_ENABLED
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Scope: ``scrapy.contrib.memusage``
|
||||
Scope: ``scrapy.extensions.memusage``
|
||||
|
||||
Whether to enable the memory usage extension that will shutdown the Scrapy
|
||||
process when it exceeds a memory limit, and also notify by email when that
|
||||
@ -704,7 +704,7 @@ MEMUSAGE_LIMIT_MB
|
||||
|
||||
Default: ``0``
|
||||
|
||||
Scope: ``scrapy.contrib.memusage``
|
||||
Scope: ``scrapy.extensions.memusage``
|
||||
|
||||
The maximum amount of memory to allow (in megabytes) before shutting down
|
||||
Scrapy (if MEMUSAGE_ENABLED is True). If zero, no check will be performed.
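A typical combination of the memory usage settings (a sketch; values and the address are illustrative)::

    MEMUSAGE_ENABLED = True
    MEMUSAGE_WARNING_MB = 1536
    MEMUSAGE_LIMIT_MB = 2048
    MEMUSAGE_NOTIFY_MAIL = ['dev@example.com']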
|
||||
@ -718,7 +718,7 @@ MEMUSAGE_NOTIFY_MAIL
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Scope: ``scrapy.contrib.memusage``
|
||||
Scope: ``scrapy.extensions.memusage``
|
||||
|
||||
A list of emails to notify if the memory limit has been reached.
|
||||
|
||||
@ -735,7 +735,7 @@ MEMUSAGE_REPORT
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Scope: ``scrapy.contrib.memusage``
|
||||
Scope: ``scrapy.extensions.memusage``
|
||||
|
||||
Whether to send a memory usage report after each spider has been closed.
|
||||
|
||||
@ -748,7 +748,7 @@ MEMUSAGE_WARNING_MB
|
||||
|
||||
Default: ``0``
|
||||
|
||||
Scope: ``scrapy.contrib.memusage``
|
||||
Scope: ``scrapy.extensions.memusage``
|
||||
|
||||
The maximum amount of memory to allow (in megabytes) before sending a warning
|
||||
email notifying about it. If zero, no warning will be produced.
|
||||
@ -837,7 +837,7 @@ ROBOTSTXT_OBEY
|
||||
|
||||
Default: ``False``
|
||||
|
||||
Scope: ``scrapy.contrib.downloadermiddleware.robotstxt``
|
||||
Scope: ``scrapy.downloadermiddlewares.robotstxt``
|
||||
|
||||
If enabled, Scrapy will respect robots.txt policies. For more information see
|
||||
:ref:`topics-dlmw-robots`
|
||||
@ -906,11 +906,11 @@ SPIDER_MIDDLEWARES_BASE
|
||||
Default::
|
||||
|
||||
{
|
||||
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
|
||||
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
|
||||
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
|
||||
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
|
||||
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
|
||||
'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
|
||||
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
|
||||
'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
|
||||
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
|
||||
'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
|
||||
}
|
||||
|
||||
A dict containing the spider middlewares enabled by default in Scrapy. You
|
||||
@ -936,7 +936,7 @@ Example::
|
||||
STATS_CLASS
|
||||
-----------
|
||||
|
||||
Default: ``'scrapy.statscol.MemoryStatsCollector'``
|
||||
Default: ``'scrapy.statscollectors.MemoryStatsCollector'``
|
||||
|
||||
The class to use for collecting stats, which must implement the
|
||||
:ref:`topics-api-stats`.
|
||||
@ -961,7 +961,7 @@ STATSMAILER_RCPTS
|
||||
Default: ``[]`` (empty list)
|
||||
|
||||
Send Scrapy stats after spiders finish scraping. See
|
||||
:class:`~scrapy.contrib.statsmailer.StatsMailer` for more info.
|
||||
:class:`~scrapy.extensions.statsmailer.StatsMailer` for more info.
|
||||
|
||||
.. setting:: TELNETCONSOLE_ENABLED
|
||||
|
||||
@ -1001,7 +1001,7 @@ URLLENGTH_LIMIT
|
||||
|
||||
Default: ``2083``
|
||||
|
||||
Scope: ``contrib.spidermiddleware.urllength``
|
||||
Scope: ``spidermiddlewares.urllength``
|
||||
|
||||
The maximum URL length to allow for crawled URLs. For more information about
|
||||
the default value for this setting see: http://www.boutell.com/newfaq/misc/urllength.html
|
||||
|
@ -43,7 +43,7 @@ value. For example, if you want to disable the off-site middleware::
|
||||
|
||||
SPIDER_MIDDLEWARES = {
|
||||
'myproject.middlewares.CustomSpiderMiddleware': 543,
|
||||
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None,
|
||||
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
|
||||
}
|
||||
|
||||
Finally, keep in mind that some middlewares may need to be enabled through a
|
||||
@ -55,7 +55,7 @@ Writing your own spider middleware
|
||||
Each middleware component is a Python class that defines one or more of the
|
||||
following methods:
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware
|
||||
.. module:: scrapy.spidermiddlewares
|
||||
|
||||
.. class:: SpiderMiddleware
|
||||
|
||||
@ -178,7 +178,7 @@ For a list of the components enabled by default (and their orders) see the
|
||||
DepthMiddleware
|
||||
---------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.depth
|
||||
.. module:: scrapy.spidermiddlewares.depth
|
||||
:synopsis: Depth Spider Middleware
|
||||
|
||||
.. class:: DepthMiddleware
|
||||
@ -199,7 +199,7 @@ DepthMiddleware
|
||||
HttpErrorMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.httperror
|
||||
.. module:: scrapy.spidermiddlewares.httperror
|
||||
:synopsis: HTTP Error Spider Middleware
|
||||
|
||||
.. class:: HttpErrorMiddleware
|
||||
@ -264,7 +264,7 @@ Pass all responses, regardless of its status code.
|
||||
OffsiteMiddleware
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.offsite
|
||||
.. module:: scrapy.spidermiddlewares.offsite
|
||||
:synopsis: Offsite Spider Middleware
|
||||
|
||||
.. class:: OffsiteMiddleware
|
||||
@ -298,7 +298,7 @@ OffsiteMiddleware
|
||||
RefererMiddleware
|
||||
-----------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.referer
|
||||
.. module:: scrapy.spidermiddlewares.referer
|
||||
:synopsis: Referer Spider Middleware
|
||||
|
||||
.. class:: RefererMiddleware
|
||||
@ -323,7 +323,7 @@ Whether to enable referer middleware.
|
||||
UrlLengthMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.spidermiddleware.urllength
|
||||
.. module:: scrapy.spidermiddlewares.urllength
|
||||
:synopsis: URL Length Spider Middleware
|
||||
|
||||
.. class:: UrlLengthMiddleware
|
||||
|
@ -77,7 +77,7 @@ scrapy.Spider
|
||||
An optional list of strings containing domains that this spider is
|
||||
allowed to crawl. Requests for URLs not belonging to the domain names
|
||||
specified in this list won't be followed if
|
||||
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled.
|
||||
:class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware` is enabled.
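For example (a sketch; the domain is a placeholder)::

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'example'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            # requests to other domains are filtered out by OffsiteMiddleware
            pass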
|
||||
|
||||
.. attribute:: start_urls
|
||||
|
||||
@ -319,7 +319,7 @@ with a ``TestItem`` declared in a ``myproject.items`` module::
|
||||
description = scrapy.Field()
|
||||
|
||||
|
||||
.. module:: scrapy.contrib.spiders
|
||||
.. module:: scrapy.spiders
|
||||
:synopsis: Collection of generic spiders
|
||||
|
||||
CrawlSpider
|
||||
@ -394,8 +394,8 @@ CrawlSpider example
|
||||
Let's now take a look at an example CrawlSpider with rules::
|
||||
|
||||
import scrapy
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
|
||||
class MySpider(CrawlSpider):
|
||||
name = 'example.com'
|
||||
@ -515,7 +515,7 @@ XMLFeedSpider example
|
||||
|
||||
These spiders are pretty easy to use, let's have a look at one example::
|
||||
|
||||
from scrapy.contrib.spiders import XMLFeedSpider
|
||||
from scrapy.spiders import XMLFeedSpider
|
||||
from myproject.items import TestItem
|
||||
|
||||
class MySpider(XMLFeedSpider):
|
||||
@ -575,7 +575,7 @@ CSVFeedSpider example
|
||||
Let's see an example similar to the previous one, but using a
|
||||
:class:`CSVFeedSpider`::
|
||||
|
||||
from scrapy.contrib.spiders import CSVFeedSpider
|
||||
from scrapy.spiders import CSVFeedSpider
|
||||
from myproject.items import TestItem
|
||||
|
||||
class MySpider(CSVFeedSpider):
|
||||
@ -669,7 +669,7 @@ SitemapSpider examples
|
||||
Simplest example: process all urls discovered through sitemaps using the
|
||||
``parse`` callback::
|
||||
|
||||
from scrapy.contrib.spiders import SitemapSpider
|
||||
from scrapy.spiders import SitemapSpider
|
||||
|
||||
class MySpider(SitemapSpider):
|
||||
sitemap_urls = ['http://www.example.com/sitemap.xml']
|
||||
@ -680,7 +680,7 @@ Simplest example: process all urls discovered through sitemaps using the
|
||||
Process some urls with certain callback and other urls with a different
|
||||
callback::
|
||||
|
||||
from scrapy.contrib.spiders import SitemapSpider
|
||||
from scrapy.spiders import SitemapSpider
|
||||
|
||||
class MySpider(SitemapSpider):
|
||||
sitemap_urls = ['http://www.example.com/sitemap.xml']
|
||||
@ -698,7 +698,7 @@ callback::
|
||||
Follow sitemaps defined in the `robots.txt`_ file and only follow sitemaps
|
||||
whose url contains ``/sitemap_shop``::
|
||||
|
||||
from scrapy.contrib.spiders import SitemapSpider
|
||||
from scrapy.spiders import SitemapSpider
|
||||
|
||||
class MySpider(SitemapSpider):
|
||||
sitemap_urls = ['http://www.example.com/robots.txt']
|
||||
@ -712,7 +712,7 @@ whose url contains ``/sitemap_shop``::
|
||||
|
||||
Combine SitemapSpider with other sources of urls::
|
||||
|
||||
from scrapy.contrib.spiders import SitemapSpider
|
||||
from scrapy.spiders import SitemapSpider
|
||||
|
||||
class MySpider(SitemapSpider):
|
||||
sitemap_urls = ['http://www.example.com/robots.txt']
|
||||
|
@ -75,7 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
|
||||
which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
|
||||
default Stats Collector used is the :class:`MemoryStatsCollector`.
|
||||
|
||||
.. module:: scrapy.statscol
|
||||
.. module:: scrapy.statscollectors
|
||||
:synopsis: Stats Collectors
|
||||
|
||||
MemoryStatsCollector
|
||||
|
@ -6,7 +6,7 @@ from six.moves.urllib.parse import urlencode
|
||||
|
||||
import scrapy
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.contrib.linkextractors import LinkExtractor
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
@ -1,65 +1,7 @@
|
||||
"""CloseSpider is an extension that forces spiders to be closed after certain
|
||||
conditions are met.
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.closespider` is deprecated, "
|
||||
"use `scrapy.extensions.closespider` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/extensions.rst
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from twisted.internet import reactor
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class CloseSpider(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.crawler = crawler
|
||||
|
||||
self.close_on = {
|
||||
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
|
||||
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
|
||||
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
|
||||
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
|
||||
}
|
||||
|
||||
self.counter = defaultdict(int)
|
||||
|
||||
if self.close_on.get('errorcount'):
|
||||
crawler.signals.connect(self.error_count, signal=signals.spider_error)
|
||||
if self.close_on.get('pagecount'):
|
||||
crawler.signals.connect(self.page_count, signal=signals.response_received)
|
||||
if self.close_on.get('timeout'):
|
||||
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
if self.close_on.get('itemcount'):
|
||||
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
|
||||
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def error_count(self, failure, response, spider):
|
||||
self.counter['errorcount'] += 1
|
||||
if self.counter['errorcount'] == self.close_on['errorcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
|
||||
|
||||
def page_count(self, response, request, spider):
|
||||
self.counter['pagecount'] += 1
|
||||
if self.counter['pagecount'] == self.close_on['pagecount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.task = reactor.callLater(self.close_on['timeout'], \
|
||||
self.crawler.engine.close_spider, spider, \
|
||||
reason='closespider_timeout')
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.counter['itemcount'] += 1
|
||||
if self.counter['itemcount'] == self.close_on['itemcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
|
||||
|
||||
def spider_closed(self, spider):
|
||||
task = getattr(self, 'task', False)
|
||||
if task and task.active():
|
||||
task.cancel()
|
||||
from scrapy.extensions.closespider import *
|
||||
|
@ -1,39 +1,7 @@
|
||||
"""
|
||||
Extension for collecting core stats like items scraped and start/finish times
|
||||
"""
|
||||
import datetime
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.corestats` is deprecated, "
|
||||
"use `scrapy.extensions.corestats` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
class CoreStats(object):
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
|
||||
crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
|
||||
crawler.signals.connect(o.response_received, signal=signals.response_received)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
|
||||
self.stats.set_value('finish_reason', reason, spider=spider)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.stats.inc_value('item_scraped_count', spider=spider)
|
||||
|
||||
def response_received(self, spider):
|
||||
self.stats.inc_value('response_received_count', spider=spider)
|
||||
|
||||
def item_dropped(self, item, spider, exception):
|
||||
reason = exception.__class__.__name__
|
||||
self.stats.inc_value('item_dropped_count', spider=spider)
|
||||
self.stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
|
||||
from scrapy.extensions.corestats import *
|
||||
|
@ -1,64 +1,7 @@
|
||||
"""
|
||||
Extensions for debugging Scrapy
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.debug` is deprecated, "
|
||||
"use `scrapy.extensions.debug` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/extensions.rst
|
||||
"""
|
||||
|
||||
import sys
|
||||
import signal
|
||||
import logging
|
||||
import traceback
|
||||
import threading
|
||||
from pdb import Pdb
|
||||
|
||||
from scrapy.utils.engine import format_engine_status
|
||||
from scrapy.utils.trackref import format_live_refs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StackTraceDump(object):
|
||||
|
||||
def __init__(self, crawler=None):
|
||||
self.crawler = crawler
|
||||
try:
|
||||
signal.signal(signal.SIGUSR2, self.dump_stacktrace)
|
||||
signal.signal(signal.SIGQUIT, self.dump_stacktrace)
|
||||
except AttributeError:
|
||||
# win32 platforms don't support SIGUSR signals
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def dump_stacktrace(self, signum, frame):
|
||||
log_args = {
|
||||
'stackdumps': self._thread_stacks(),
|
||||
'enginestatus': format_engine_status(self.crawler.engine),
|
||||
'liverefs': format_live_refs(),
|
||||
}
|
||||
logger.info("Dumping stack trace and engine status\n"
|
||||
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
|
||||
log_args, extra={'crawler': self.crawler})
|
||||
|
||||
def _thread_stacks(self):
|
||||
id2name = dict((th.ident, th.name) for th in threading.enumerate())
|
||||
dumps = ''
|
||||
for id_, frame in sys._current_frames().items():
|
||||
name = id2name.get(id_, '')
|
||||
dump = ''.join(traceback.format_stack(frame))
|
||||
dumps += "# Thread: {0}({1})\n{2}\n".format(name, id_, dump)
|
||||
return dumps
|
||||
|
||||
|
||||
class Debugger(object):
|
||||
def __init__(self):
|
||||
try:
|
||||
signal.signal(signal.SIGUSR2, self._enter_debugger)
|
||||
except AttributeError:
|
||||
# win32 platforms don't support SIGUSR signals
|
||||
pass
|
||||
|
||||
def _enter_debugger(self, signum, frame):
|
||||
Pdb().set_trace(frame.f_back)
|
||||
from scrapy.extensions.debug import *
|
||||
|
@ -1,96 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
import re
|
||||
import logging
|
||||
|
||||
import six
|
||||
from w3lib import html
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import _noscript_re, _script_re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AjaxCrawlMiddleware(object):
|
||||
"""
|
||||
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
|
||||
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('AJAXCRAWL_ENABLED'):
|
||||
raise NotConfigured
|
||||
|
||||
# XXX: Google parses at least first 100k bytes; scrapy's redirect
|
||||
# middleware parses first 4k. 4k turns out to be insufficient
|
||||
# for this middleware, and parsing 100k could be slow.
|
||||
# We use something in between (32K) by default.
|
||||
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
|
||||
if not isinstance(response, HtmlResponse) or response.status != 200:
|
||||
return response
|
||||
|
||||
if request.method != 'GET':
|
||||
# other HTTP methods are either not safe or don't have a body
|
||||
return response
|
||||
|
||||
if 'ajax_crawlable' in request.meta: # prevent loops
|
||||
return response
|
||||
|
||||
if not self._has_ajax_crawlable_variant(response):
|
||||
return response
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawl_request = request.replace(url=request.url+'#!')
|
||||
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
|
||||
extra={'spider': spider})
|
||||
|
||||
ajax_crawl_request.meta['ajax_crawlable'] = True
|
||||
return ajax_crawl_request
|
||||
|
||||
def _has_ajax_crawlable_variant(self, response):
|
||||
"""
|
||||
Return True if a page without hash fragment could be "AJAX crawlable"
|
||||
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
body = response.body_as_unicode()[:self.lookup_bytes]
|
||||
return _has_ajaxcrawlable_meta(body)
|
||||
|
||||
|
||||
# XXX: move it to w3lib?
|
||||
_ajax_crawlable_re = re.compile(six.u(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'))
|
||||
def _has_ajaxcrawlable_meta(text):
|
||||
"""
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
|
||||
False
|
||||
>>> _has_ajaxcrawlable_meta('<html></html>')
|
||||
False
|
||||
"""
|
||||
|
||||
# Stripping scripts and comments is slow (about 20x slower than
|
||||
# just checking if a string is in text); this is a quick fail-fast
|
||||
# path that should work for most pages.
|
||||
if 'fragment' not in text:
|
||||
return False
|
||||
if 'content' not in text:
|
||||
return False
|
||||
|
||||
text = _script_re.sub(u'', text)
|
||||
text = _noscript_re.sub(u'', text)
|
||||
text = html.remove_comments(html.replace_entities(text))
|
||||
return _ajax_crawlable_re.search(text) is not None
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.ajaxcrawl` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.ajaxcrawl` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.downloadermiddlewares.ajaxcrawl import *
|
||||
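An illustrative settings snippet for this middleware: AJAXCRAWL_ENABLED gates it (NotConfigured otherwise) and AJAXCRAWL_MAXSIZE overrides the 32768-byte lookup window read above. The values shown are examples only.

# settings.py
AJAXCRAWL_ENABLED = True
AJAXCRAWL_MAXSIZE = 32768   # bytes of the body scanned for the fragment meta tag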
|
@ -1,13 +1,7 @@
|
||||
from scrapy.utils.http import decode_chunked_transfer
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.chunked` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.chunked` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
|
||||
class ChunkedTransferMiddleware(object):
|
||||
"""This middleware adds support for chunked transfer encoding, as
|
||||
documented in: http://en.wikipedia.org/wiki/Chunked_transfer_encoding
|
||||
"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if response.headers.get('Transfer-Encoding') == 'chunked':
|
||||
body = decode_chunked_transfer(response.body)
|
||||
return response.replace(body=body)
|
||||
return response
|
||||
from scrapy.downloadermiddlewares.chunked import *
|
||||
|
@ -1,91 +1,7 @@
|
||||
import os
|
||||
import six
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.cookies` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.cookies` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import Response
|
||||
from scrapy.http.cookies import CookieJar
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CookiesMiddleware(object):
|
||||
"""This middleware enables working with sites that need cookies"""
|
||||
|
||||
def __init__(self, debug=False):
|
||||
self.jars = defaultdict(CookieJar)
|
||||
self.debug = debug
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COOKIES_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return
|
||||
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
cookies = self._get_request_cookies(jar, request)
|
||||
for cookie in cookies:
|
||||
jar.set_cookie_if_ok(cookie, request)
|
||||
|
||||
# set Cookie header
|
||||
request.headers.pop('Cookie', None)
|
||||
jar.add_cookie_header(request)
|
||||
self._debug_cookie(request, spider)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return response
|
||||
|
||||
# extract cookies from Set-Cookie and drop invalid/expired cookies
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
jar.extract_cookies(response, request)
|
||||
self._debug_set_cookie(response, spider)
|
||||
|
||||
return response
|
||||
|
||||
def _debug_cookie(self, request, spider):
|
||||
if self.debug:
|
||||
cl = request.headers.getlist('Cookie')
|
||||
if cl:
|
||||
msg = "Sending cookies to: %s" % request + os.linesep
|
||||
msg += os.linesep.join("Cookie: %s" % c for c in cl)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _debug_set_cookie(self, response, spider):
|
||||
if self.debug:
|
||||
cl = response.headers.getlist('Set-Cookie')
|
||||
if cl:
|
||||
msg = "Received cookies from: %s" % response + os.linesep
|
||||
msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _format_cookie(self, cookie):
|
||||
# build cookie string
|
||||
cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
|
||||
|
||||
if cookie.get('path', None):
|
||||
cookie_str += '; Path=%s' % cookie['path']
|
||||
if cookie.get('domain', None):
|
||||
cookie_str += '; Domain=%s' % cookie['domain']
|
||||
|
||||
return cookie_str
|
||||
|
||||
def _get_request_cookies(self, jar, request):
|
||||
if isinstance(request.cookies, dict):
|
||||
cookie_list = [{'name': k, 'value': v} for k, v in \
|
||||
six.iteritems(request.cookies)]
|
||||
else:
|
||||
cookie_list = request.cookies
|
||||
|
||||
cookies = [self._format_cookie(x) for x in cookie_list]
|
||||
headers = {'Set-Cookie': cookies}
|
||||
response = Response(request.url, headers=headers)
|
||||
|
||||
return jar.make_cookies(response, request)
|
||||
from scrapy.downloadermiddlewares.cookies import *
|
||||
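A minimal spider sketch showing the two request.meta keys handled above: 'cookiejar' to keep separate sessions and 'dont_merge_cookies' to bypass the middleware for one request. The URLs and session keys are placeholders.

import scrapy


class SessionSpider(scrapy.Spider):
    name = 'sessions'
    start_urls = ['http://example.com/login']   # placeholder URL

    def parse(self, response):
        for session_id in (1, 2):
            # Each distinct 'cookiejar' key gets its own cookie jar in the middleware
            yield scrapy.Request('http://example.com/account',   # placeholder URL
                                 meta={'cookiejar': session_id},
                                 callback=self.parse_account,
                                 dont_filter=True)

    def parse_account(self, response):
        # Skip cookie merging entirely for this one request
        yield scrapy.Request('http://example.com/public',        # placeholder URL
                             meta={'dont_merge_cookies': True})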
|
@ -1,88 +1,7 @@
|
||||
""" This module implements the DecompressionMiddleware which tries to recognise
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
"""
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.decompression` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.decompression` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import zipfile
|
||||
import tarfile
|
||||
import logging
|
||||
from tempfile import mktemp
|
||||
|
||||
import six
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO as BytesIO
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DecompressionMiddleware(object):
|
||||
""" This middleware tries to recognise and extract the possibly compressed
|
||||
responses that may arrive. """
|
||||
|
||||
def __init__(self):
|
||||
self._formats = {
|
||||
'tar': self._is_tar,
|
||||
'zip': self._is_zip,
|
||||
'gz': self._is_gzip,
|
||||
'bz2': self._is_bzip2
|
||||
}
|
||||
|
||||
def _is_tar(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
|
||||
except tarfile.ReadError:
|
||||
return
|
||||
|
||||
body = tar_file.extractfile(tar_file.members[0]).read()
|
||||
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_zip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
zip_file = zipfile.ZipFile(archive)
|
||||
except zipfile.BadZipfile:
|
||||
return
|
||||
|
||||
namelist = zip_file.namelist()
|
||||
body = zip_file.read(namelist[0])
|
||||
respcls = responsetypes.from_args(filename=namelist[0], body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_gzip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
body = gzip.GzipFile(fileobj=archive).read()
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_bzip2(self, response):
|
||||
try:
|
||||
body = bz2.decompress(response.body)
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if not response.body:
|
||||
return response
|
||||
|
||||
for fmt, func in six.iteritems(self._formats):
|
||||
new_response = func(response)
|
||||
if new_response:
|
||||
logger.debug('Decompressed response with format: %(responsefmt)s',
|
||||
{'responsefmt': fmt}, extra={'spider': spider})
|
||||
return new_response
|
||||
return response
|
||||
from scrapy.downloadermiddlewares.decompression import *
|
||||
|
@ -1,19 +1,7 @@
|
||||
"""
|
||||
DefaultHeaders downloader middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.defaultheaders` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.defaultheaders` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
|
||||
class DefaultHeadersMiddleware(object):
|
||||
|
||||
def __init__(self, headers):
|
||||
self._headers = headers
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings.get('DEFAULT_REQUEST_HEADERS').items())
|
||||
|
||||
def process_request(self, request, spider):
|
||||
for k, v in self._headers:
|
||||
request.headers.setdefault(k, v)
|
||||
from scrapy.downloadermiddlewares.defaultheaders import *
|
||||
|
@ -1,26 +1,7 @@
|
||||
"""
|
||||
Download timeout middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.downloadtimeout` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.downloadtimeout` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class DownloadTimeoutMiddleware(object):
|
||||
|
||||
def __init__(self, timeout=180):
|
||||
self._timeout = timeout
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self._timeout = getattr(spider, 'download_timeout', self._timeout)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if self._timeout:
|
||||
request.meta.setdefault('download_timeout', self._timeout)
|
||||
from scrapy.downloadermiddlewares.downloadtimeout import *
|
||||
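Both knobs used above can be set per project (the DOWNLOAD_TIMEOUT setting) or per spider (a download_timeout attribute); a minimal sketch with placeholder values:

import scrapy


class SlowSiteSpider(scrapy.Spider):
    name = 'slowsite'
    download_timeout = 15                    # seconds; picked up in spider_opened() above
    start_urls = ['http://example.com/']     # placeholder URL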
|
@ -1,31 +1,7 @@
|
||||
"""
|
||||
HTTP basic auth downloader middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpauth` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.httpauth` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
from w3lib.http import basic_auth_header
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class HttpAuthMiddleware(object):
|
||||
"""Set Basic HTTP Authorization header
|
||||
(http_user and http_pass spider class attributes)"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls()
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
usr = getattr(spider, 'http_user', '')
|
||||
pwd = getattr(spider, 'http_pass', '')
|
||||
if usr or pwd:
|
||||
self.auth = basic_auth_header(usr, pwd)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
auth = getattr(self, 'auth', None)
|
||||
if auth and 'Authorization' not in request.headers:
|
||||
request.headers['Authorization'] = auth
|
||||
from scrapy.downloadermiddlewares.httpauth import *
|
||||
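The http_user and http_pass attributes read in spider_opened() are the only configuration; a minimal sketch with placeholder credentials and URL:

import scrapy


class IntranetSpider(scrapy.Spider):
    name = 'intranet'
    http_user = 'someuser'       # placeholder credentials
    http_pass = 'somepass'
    start_urls = ['http://intranet.example.com/']   # placeholder URL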
|
@ -1,105 +1,7 @@
|
||||
from email.utils import formatdate
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.utils.misc import load_object
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpcache` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.httpcache` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
|
||||
class HttpCacheMiddleware(object):
|
||||
|
||||
def __init__(self, settings, stats):
|
||||
if not settings.getbool('HTTPCACHE_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
|
||||
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings, crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.storage.open_spider(spider)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.storage.close_spider(spider)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_cache', False):
|
||||
return
|
||||
|
||||
# Skip uncacheable requests
|
||||
if not self.policy.should_cache_request(request):
|
||||
request.meta['_dont_cache'] = True # flag as uncacheable
|
||||
return
|
||||
|
||||
# Look for cached response and check if expired
|
||||
cachedresponse = self.storage.retrieve_response(spider, request)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/miss', spider=spider)
|
||||
if self.ignore_missing:
|
||||
self.stats.inc_value('httpcache/ignore', spider=spider)
|
||||
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
||||
return # first time request
|
||||
|
||||
# Return cached response only if not expired
|
||||
cachedresponse.flags.append('cached')
|
||||
if self.policy.is_cached_response_fresh(cachedresponse, request):
|
||||
self.stats.inc_value('httpcache/hit', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
# Keep a reference to cached response to avoid a second cache lookup on
|
||||
# process_response hook
|
||||
request.meta['cached_response'] = cachedresponse
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_cache', False):
|
||||
return response
|
||||
|
||||
# Skip cached responses and uncacheable requests
|
||||
if 'cached' in response.flags or '_dont_cache' in request.meta:
|
||||
request.meta.pop('_dont_cache', None)
|
||||
return response
|
||||
|
||||
# RFC2616 requires origin server to set Date header,
|
||||
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
|
||||
if 'Date' not in response.headers:
|
||||
response.headers['Date'] = formatdate(usegmt=1)
|
||||
|
||||
# Do not validate first-hand responses
|
||||
cachedresponse = request.meta.pop('cached_response', None)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/firsthand', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
if self.policy.is_cached_response_valid(cachedresponse, response, request):
|
||||
self.stats.inc_value('httpcache/revalidate', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
self.stats.inc_value('httpcache/invalidate', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
def _cache_response(self, spider, response, request, cachedresponse):
|
||||
if self.policy.should_cache_response(response, request):
|
||||
self.stats.inc_value('httpcache/store', spider=spider)
|
||||
self.storage.store_response(spider, request, response)
|
||||
else:
|
||||
self.stats.inc_value('httpcache/uncacheable', spider=spider)
|
||||
|
||||
|
||||
from scrapy.contrib.httpcache import FilesystemCacheStorage as _FilesystemCacheStorage
|
||||
class FilesystemCacheStorage(_FilesystemCacheStorage):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn('Importing FilesystemCacheStorage from '
|
||||
'scrapy.contrib.downloadermiddleware.httpcache is '
|
||||
'deprecated, use scrapy.contrib.httpcache instead.',
|
||||
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||
super(FilesystemCacheStorage, self).__init__(*args, **kwargs)
|
||||
from scrapy.downloadermiddlewares.httpcache import *
|
||||
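An illustrative settings snippet wiring this middleware to the policy and storage classes from the relocated scrapy.extensions.httpcache module shown later in this commit; it assumes FilesystemCacheStorage keeps its name under the new path, and the values are examples only.

# settings.py
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_IGNORE_MISSING = False   # when True, uncached pages raise IgnoreRequest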
|
@ -1,56 +1,7 @@
|
||||
import zlib
|
||||
|
||||
from scrapy.utils.gz import gunzip, is_gzipped
|
||||
from scrapy.http import Response, TextResponse
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
class HttpCompressionMiddleware(object):
|
||||
"""This middleware allows compressed (gzip, deflate) traffic to be
|
||||
sent/received from web sites"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls()
|
||||
|
||||
def process_request(self, request, spider):
|
||||
request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if isinstance(response, Response):
|
||||
content_encoding = response.headers.getlist('Content-Encoding')
|
||||
if content_encoding and not is_gzipped(response):
|
||||
encoding = content_encoding.pop()
|
||||
decoded_body = self._decode(response.body, encoding.lower())
|
||||
respcls = responsetypes.from_args(headers=response.headers, \
|
||||
url=response.url)
|
||||
kwargs = dict(cls=respcls, body=decoded_body)
|
||||
if issubclass(respcls, TextResponse):
|
||||
# force recalculating the encoding until we make sure the
|
||||
# responsetypes guessing is reliable
|
||||
kwargs['encoding'] = None
|
||||
response = response.replace(**kwargs)
|
||||
if not content_encoding:
|
||||
del response.headers['Content-Encoding']
|
||||
|
||||
return response
|
||||
|
||||
def _decode(self, body, encoding):
|
||||
if encoding == 'gzip' or encoding == 'x-gzip':
|
||||
body = gunzip(body)
|
||||
|
||||
if encoding == 'deflate':
|
||||
try:
|
||||
body = zlib.decompress(body)
|
||||
except zlib.error:
|
||||
# ugly hack to work with raw deflate content that may
|
||||
# be sent by microsoft servers. For more information, see:
|
||||
# http://carsten.codimi.de/gzip.yaws/
|
||||
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
|
||||
# http://www.gzip.org/zlib/zlib_faq.html#faq38
|
||||
body = zlib.decompress(body, -15)
|
||||
return body
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpcompression` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.httpcompression` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.downloadermiddlewares.httpcompression import *
|
||||
|
@ -1,55 +1,7 @@
|
||||
import base64
|
||||
from six.moves.urllib.request import getproxies, proxy_bypass
|
||||
from six.moves.urllib.parse import unquote
|
||||
try:
|
||||
from urllib2 import _parse_proxy
|
||||
except ImportError:
|
||||
from urllib.request import _parse_proxy
|
||||
from six.moves.urllib.parse import urlunparse
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.httpproxy` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.httpproxy` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
class HttpProxyMiddleware(object):
|
||||
|
||||
def __init__(self):
|
||||
self.proxies = {}
|
||||
for type, url in getproxies().items():
|
||||
self.proxies[type] = self._get_proxy(url, type)
|
||||
|
||||
if not self.proxies:
|
||||
raise NotConfigured
|
||||
|
||||
def _get_proxy(self, url, orig_type):
|
||||
proxy_type, user, password, hostport = _parse_proxy(url)
|
||||
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
|
||||
|
||||
if user and password:
|
||||
user_pass = '%s:%s' % (unquote(user), unquote(password))
|
||||
creds = base64.b64encode(user_pass).strip()
|
||||
else:
|
||||
creds = None
|
||||
|
||||
return creds, proxy_url
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# ignore if proxy is already set
|
||||
if 'proxy' in request.meta:
|
||||
return
|
||||
|
||||
parsed = urlparse_cached(request)
|
||||
scheme = parsed.scheme
|
||||
|
||||
# 'no_proxy' is only supported by http schemes
|
||||
if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
|
||||
return
|
||||
|
||||
if scheme in self.proxies:
|
||||
self._set_proxy(request, scheme)
|
||||
|
||||
def _set_proxy(self, request, scheme):
|
||||
creds, proxy = self.proxies[scheme]
|
||||
request.meta['proxy'] = proxy
|
||||
if creds:
|
||||
request.headers['Proxy-Authorization'] = 'Basic ' + creds
|
||||
from scrapy.downloadermiddlewares.httpproxy import *
|
||||
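Proxies are normally picked up from the environment via getproxies() (http_proxy, https_proxy, ...); a per-request override is also honoured, since process_request() returns early when 'proxy' is already in meta. A minimal sketch with placeholder addresses:

import scrapy

# Explicit per-request proxy; skips the environment-based lookup above.
request = scrapy.Request('http://example.com/',                      # placeholder URL
                         meta={'proxy': 'http://127.0.0.1:8118'})    # placeholder proxy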
|
@ -1,101 +1,7 @@
|
||||
import logging
|
||||
from six.moves.urllib.parse import urljoin
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.redirect` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.redirect` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import get_meta_refresh
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseRedirectMiddleware(object):
|
||||
|
||||
enabled_setting = 'REDIRECT_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool(self.enabled_setting):
|
||||
raise NotConfigured
|
||||
|
||||
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
|
||||
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def _redirect(self, redirected, request, spider, reason):
|
||||
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
|
||||
redirects = request.meta.get('redirect_times', 0) + 1
|
||||
|
||||
if ttl and redirects <= self.max_redirect_times:
|
||||
redirected.meta['redirect_times'] = redirects
|
||||
redirected.meta['redirect_ttl'] = ttl - 1
|
||||
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
|
||||
[request.url]
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
|
||||
{'reason': reason, 'redirected': redirected, 'request': request},
|
||||
extra={'spider': spider})
|
||||
return redirected
|
||||
else:
|
||||
logger.debug("Discarding %(request)s: max redirections reached",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest("max redirections reached")
|
||||
|
||||
def _redirect_request_using_get(self, request, redirect_url):
|
||||
redirected = request.replace(url=redirect_url, method='GET', body='')
|
||||
redirected.headers.pop('Content-Type', None)
|
||||
redirected.headers.pop('Content-Length', None)
|
||||
return redirected
|
||||
|
||||
|
||||
class RedirectMiddleware(BaseRedirectMiddleware):
|
||||
"""Handle redirection of requests based on response status and meta-refresh html tag"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_redirect', False):
|
||||
return response
|
||||
|
||||
if request.method == 'HEAD':
|
||||
if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
else:
|
||||
return response
|
||||
|
||||
if response.status in [302, 303] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = self._redirect_request_using_get(request, redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
if response.status in [301, 307] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
enabled_setting = 'METAREFRESH_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
super(MetaRefreshMiddleware, self).__init__(settings)
|
||||
self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
|
||||
settings.getint('METAREFRESH_MAXDELAY'))
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_redirect', False) or request.method == 'HEAD' or \
|
||||
not isinstance(response, HtmlResponse):
|
||||
return response
|
||||
|
||||
if isinstance(response, HtmlResponse):
|
||||
interval, url = get_meta_refresh(response)
|
||||
if url and interval < self._maxdelay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
||||
return response
|
||||
from scrapy.downloadermiddlewares.redirect import *
|
||||
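The redirect handling above is tunable through settings, and individual requests can opt out with meta={'dont_redirect': True}. An illustrative snippet with example values:

# settings.py
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 5          # cap on the redirect_times counter kept in meta
REDIRECT_PRIORITY_ADJUST = 2    # redirected requests get a priority bump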
|
@ -1,82 +1,7 @@
|
||||
"""
|
||||
An extension to retry failed requests that are potentially caused by temporary
|
||||
problems such as a connection timeout or HTTP 500 error.
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.retry` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.retry` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
You can change the behaviour of this middleware by modifying the scraping settings:
|
||||
RETRY_TIMES - how many times to retry a failed page
|
||||
RETRY_HTTP_CODES - which HTTP response codes to retry
|
||||
|
||||
Failed pages are collected during the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non-failed) pages. Once
there are no more failed pages to retry, this middleware sends a signal
(retry_complete), so other extensions can connect to that signal.
|
||||
|
||||
About HTTP errors to consider:
|
||||
|
||||
- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
protocol. It's included by default because it's a common code used to
indicate server overload, which is something we may want to retry.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import TimeoutError, DNSLookupError, \
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError, \
|
||||
ConnectionLost, TCPTimedOutError
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.response import response_status_message
|
||||
from scrapy.xlib.tx import ResponseFailed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryMiddleware(object):
|
||||
|
||||
# IOError is raised by the HttpCompression middleware when trying to
|
||||
# decompress an empty response
|
||||
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError,
|
||||
ConnectionLost, TCPTimedOutError, ResponseFailed,
|
||||
IOError)
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('RETRY_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.max_retry_times = settings.getint('RETRY_TIMES')
|
||||
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
|
||||
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_retry', False):
|
||||
return response
|
||||
if response.status in self.retry_http_codes:
|
||||
reason = response_status_message(response.status)
|
||||
return self._retry(request, reason, spider) or response
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
|
||||
and not request.meta.get('dont_retry', False):
|
||||
return self._retry(request, exception, spider)
|
||||
|
||||
def _retry(self, request, reason, spider):
|
||||
retries = request.meta.get('retry_times', 0) + 1
|
||||
|
||||
if retries <= self.max_retry_times:
|
||||
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
retryreq = request.copy()
|
||||
retryreq.meta['retry_times'] = retries
|
||||
retryreq.dont_filter = True
|
||||
retryreq.priority = request.priority + self.priority_adjust
|
||||
return retryreq
|
||||
else:
|
||||
logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
from scrapy.downloadermiddlewares.retry import *
|
||||
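An illustrative settings snippet for the options listed in the docstring above; requests can also opt out individually with meta={'dont_retry': True}. The values are examples only.

# settings.py
RETRY_ENABLED = True
RETRY_TIMES = 2                                # retries per failed page
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
RETRY_PRIORITY_ADJUST = -1                     # push retried requests back in the queue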
|
@ -1,67 +1,7 @@
|
||||
"""
|
||||
This is a middleware to respect robots.txt policies. To activate it you must
|
||||
enable this middleware and enable the ROBOTSTXT_OBEY setting.
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.robotstxt` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.robotstxt` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from six.moves.urllib import robotparser
|
||||
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsTxtMiddleware(object):
|
||||
DOWNLOAD_PRIORITY = 1000
|
||||
|
||||
def __init__(self, crawler):
|
||||
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
|
||||
raise NotConfigured
|
||||
|
||||
self.crawler = crawler
|
||||
self._useragent = crawler.settings.get('USER_AGENT')
|
||||
self._parsers = {}
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_obey_robotstxt'):
|
||||
return
|
||||
rp = self.robot_parser(request, spider)
|
||||
if rp and not rp.can_fetch(self._useragent, request.url):
|
||||
logger.debug("Forbidden by robots.txt: %(request)s",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest
|
||||
|
||||
def robot_parser(self, request, spider):
|
||||
url = urlparse_cached(request)
|
||||
netloc = url.netloc
|
||||
if netloc not in self._parsers:
|
||||
self._parsers[netloc] = None
|
||||
robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
|
||||
robotsreq = Request(
|
||||
robotsurl,
|
||||
priority=self.DOWNLOAD_PRIORITY,
|
||||
meta={'dont_obey_robotstxt': True}
|
||||
)
|
||||
dfd = self.crawler.engine.download(robotsreq, spider)
|
||||
dfd.addCallback(self._parse_robots)
|
||||
dfd.addErrback(self._logerror, robotsreq, spider)
|
||||
return self._parsers[netloc]
|
||||
|
||||
def _logerror(self, failure, request, spider):
|
||||
if failure.type is not IgnoreRequest:
|
||||
logger.error("Error downloading %(request)s: %(f_exception)s",
|
||||
{'request': request, 'f_exception': failure.value},
|
||||
extra={'spider': spider, 'failure': failure})
|
||||
|
||||
def _parse_robots(self, response):
|
||||
rp = robotparser.RobotFileParser(response.url)
|
||||
rp.parse(response.body.splitlines())
|
||||
self._parsers[urlparse_cached(response).netloc] = rp
|
||||
from scrapy.downloadermiddlewares.robotstxt import *
|
||||
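Enabling the middleware only requires ROBOTSTXT_OBEY; the USER_AGENT string is what gets matched against the robots.txt rules in can_fetch() above. Placeholder values:

# settings.py
ROBOTSTXT_OBEY = True
USER_AGENT = 'examplebot (+http://www.example.com)'   # placeholder identity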
|
@ -1,32 +1,7 @@
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.request import request_httprepr
|
||||
from scrapy.utils.response import response_httprepr
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.stats` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.stats` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
class DownloaderStats(object):
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('DOWNLOADER_STATS'):
|
||||
raise NotConfigured
|
||||
return cls(crawler.stats)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
self.stats.inc_value('downloader/request_count', spider=spider)
|
||||
self.stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
|
||||
reqlen = len(request_httprepr(request))
|
||||
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
self.stats.inc_value('downloader/response_count', spider=spider)
|
||||
self.stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
|
||||
reslen = len(response_httprepr(response))
|
||||
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
|
||||
self.stats.inc_value('downloader/exception_count', spider=spider)
|
||||
self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
|
||||
from scrapy.downloadermiddlewares.stats import *
|
||||
|
@ -1,23 +1,7 @@
|
||||
"""Set User-Agent header per spider or use a default value from settings"""
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.downloadermiddleware.useragent` is deprecated, "
|
||||
"use `scrapy.downloadermiddlewares.useragent` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class UserAgentMiddleware(object):
|
||||
"""This middleware allows spiders to override the user_agent"""
|
||||
|
||||
def __init__(self, user_agent='Scrapy'):
|
||||
self.user_agent = user_agent
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings['USER_AGENT'])
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if self.user_agent:
|
||||
request.headers.setdefault('User-Agent', self.user_agent)
|
||||
from scrapy.downloadermiddlewares.useragent import *
|
||||
|
@ -1,261 +1,7 @@
|
||||
"""
|
||||
Item Exporters are used to export/serialize items into different formats.
|
||||
"""
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.exporter` is deprecated, "
|
||||
"use `scrapy.exporters` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
import csv
|
||||
import sys
|
||||
import pprint
|
||||
import marshal
|
||||
import six
|
||||
from six.moves import cPickle as pickle
|
||||
from xml.sax.saxutils import XMLGenerator
|
||||
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
from scrapy.item import BaseItem
|
||||
|
||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
|
||||
'JsonItemExporter', 'MarshalItemExporter']
|
||||
|
||||
|
||||
class BaseItemExporter(object):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._configure(kwargs)
|
||||
|
||||
def _configure(self, options, dont_fail=False):
|
||||
"""Configure the exporter by poping options from the ``options`` dict.
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses constructors)
|
||||
"""
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
self.export_empty_fields = options.pop('export_empty_fields', False)
|
||||
self.encoding = options.pop('encoding', 'utf-8')
|
||||
if not dont_fail and options:
|
||||
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
|
||||
|
||||
def export_item(self, item):
|
||||
raise NotImplementedError
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._to_str_if_unicode)
|
||||
return serializer(value)
|
||||
|
||||
def start_exporting(self):
|
||||
pass
|
||||
|
||||
def finish_exporting(self):
|
||||
pass
|
||||
|
||||
def _to_str_if_unicode(self, value):
|
||||
return value.encode(self.encoding) if isinstance(value, unicode) else value
|
||||
|
||||
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
"""
|
||||
if include_empty is None:
|
||||
include_empty = self.export_empty_fields
|
||||
if self.fields_to_export is None:
|
||||
if include_empty and not isinstance(item, dict):
|
||||
field_iter = six.iterkeys(item.fields)
|
||||
else:
|
||||
field_iter = six.iterkeys(item)
|
||||
else:
|
||||
if include_empty:
|
||||
field_iter = self.fields_to_export
|
||||
else:
|
||||
field_iter = (x for x in self.fields_to_export if x in item)
|
||||
|
||||
for field_name in field_iter:
|
||||
if field_name in item:
|
||||
field = {} if isinstance(item, dict) else item.fields[field_name]
|
||||
value = self.serialize_field(field, field_name, item[field_name])
|
||||
else:
|
||||
value = default_value
|
||||
|
||||
yield field_name, value
|
||||
|
||||
|
||||
class JsonLinesItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(self.encoder.encode(itemdict) + '\n')
|
||||
|
||||
|
||||
class JsonItemExporter(JsonLinesItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
self.first_item = True
|
||||
|
||||
def start_exporting(self):
|
||||
self.file.write("[")
|
||||
|
||||
def finish_exporting(self):
|
||||
self.file.write("]")
|
||||
|
||||
def export_item(self, item):
|
||||
if self.first_item:
|
||||
self.first_item = False
|
||||
else:
|
||||
self.file.write(',\n')
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(self.encoder.encode(itemdict))
|
||||
|
||||
|
||||
class XmlItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self.item_element = kwargs.pop('item_element', 'item')
|
||||
self.root_element = kwargs.pop('root_element', 'items')
|
||||
self._configure(kwargs)
|
||||
self.xg = XMLGenerator(file, encoding=self.encoding)
|
||||
|
||||
def start_exporting(self):
|
||||
self.xg.startDocument()
|
||||
self.xg.startElement(self.root_element, {})
|
||||
|
||||
def export_item(self, item):
|
||||
self.xg.startElement(self.item_element, {})
|
||||
for name, value in self._get_serialized_fields(item, default_value=''):
|
||||
self._export_xml_field(name, value)
|
||||
self.xg.endElement(self.item_element)
|
||||
|
||||
def finish_exporting(self):
|
||||
self.xg.endElement(self.root_element)
|
||||
self.xg.endDocument()
|
||||
|
||||
def _export_xml_field(self, name, serialized_value):
|
||||
self.xg.startElement(name, {})
|
||||
if hasattr(serialized_value, 'items'):
|
||||
for subname, value in serialized_value.items():
|
||||
self._export_xml_field(subname, value)
|
||||
elif hasattr(serialized_value, '__iter__'):
|
||||
for value in serialized_value:
|
||||
self._export_xml_field('value', value)
|
||||
else:
|
||||
self._xg_characters(serialized_value)
|
||||
self.xg.endElement(name)
|
||||
|
||||
# Workaround for http://bugs.python.org/issue17606
|
||||
# Before Python 2.7.4 xml.sax.saxutils required bytes;
|
||||
# since 2.7.4 it requires unicode. The bug is likely to be
|
||||
# fixed in 2.7.6, but 2.7.6 will still support unicode,
|
||||
# and Python 3.x will require unicode, so ">= 2.7.4" should be fine.
|
||||
if sys.version_info[:3] >= (2, 7, 4):
|
||||
def _xg_characters(self, serialized_value):
|
||||
if not isinstance(serialized_value, unicode):
|
||||
serialized_value = serialized_value.decode(self.encoding)
|
||||
return self.xg.characters(serialized_value)
|
||||
else:
|
||||
def _xg_characters(self, serialized_value):
|
||||
return self.xg.characters(serialized_value)
|
||||
|
||||
|
||||
class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.include_headers_line = include_headers_line
|
||||
self.csv_writer = csv.writer(file, **kwargs)
|
||||
self._headers_not_written = True
|
||||
self._join_multivalued = join_multivalued
|
||||
|
||||
def _to_str_if_unicode(self, value):
|
||||
if isinstance(value, (list, tuple)):
|
||||
try:
|
||||
value = self._join_multivalued.join(value)
|
||||
except TypeError: # list in value may not contain strings
|
||||
pass
|
||||
return super(CsvItemExporter, self)._to_str_if_unicode(value)
|
||||
|
||||
def export_item(self, item):
|
||||
if self._headers_not_written:
|
||||
self._headers_not_written = False
|
||||
self._write_headers_and_set_fields_to_export(item)
|
||||
|
||||
fields = self._get_serialized_fields(item, default_value='',
|
||||
include_empty=True)
|
||||
values = [x[1] for x in fields]
|
||||
self.csv_writer.writerow(values)
|
||||
|
||||
def _write_headers_and_set_fields_to_export(self, item):
|
||||
if self.include_headers_line:
|
||||
if not self.fields_to_export:
|
||||
if isinstance(item, dict):
|
||||
# for dicts try using fields of the first item
|
||||
self.fields_to_export = list(item.keys())
|
||||
else:
|
||||
# use fields declared in Item
|
||||
self.fields_to_export = list(item.fields.keys())
|
||||
self.csv_writer.writerow(self.fields_to_export)
|
||||
|
||||
|
||||
class PickleItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, protocol=2, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
self.protocol = protocol
|
||||
|
||||
def export_item(self, item):
|
||||
d = dict(self._get_serialized_fields(item))
|
||||
pickle.dump(d, self.file, self.protocol)
|
||||
|
||||
|
||||
class MarshalItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
|
||||
def export_item(self, item):
|
||||
marshal.dump(dict(self._get_serialized_fields(item)), self.file)
|
||||
|
||||
|
||||
class PprintItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(pprint.pformat(itemdict) + '\n')
|
||||
|
||||
|
||||
class PythonItemExporter(BaseItemExporter):
|
||||
"""The idea behind this exporter is to have a mechanism to serialize items
|
||||
to built-in python types so any serialization library (like
|
||||
json, msgpack, binc, etc) can be used on top of it. Its main goal is to
|
||||
seamless support what BaseItemExporter does plus nested items.
|
||||
"""
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._serialize_value)
|
||||
return serializer(value)
|
||||
|
||||
def _serialize_value(self, value):
|
||||
if isinstance(value, BaseItem):
|
||||
return self.export_item(value)
|
||||
if isinstance(value, dict):
|
||||
return dict(self._serialize_dict(value))
|
||||
if hasattr(value, '__iter__'):
|
||||
return [self._serialize_value(v) for v in value]
|
||||
return self._to_str_if_unicode(value)
|
||||
|
||||
def _serialize_dict(self, value):
|
||||
for key, val in six.iteritems(value):
|
||||
yield key, self._serialize_value(val)
|
||||
|
||||
def export_item(self, item):
|
||||
return dict(self._get_serialized_fields(item))
|
||||
from scrapy.exporters import *
|
||||
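A minimal usage sketch of one exporter under its new import path, assuming Python 2 semantics (this module still uses the unicode builtin); the item dict and file name are placeholders.

from scrapy.exporters import JsonItemExporter

f = open('items.json', 'w')                 # placeholder file name
exporter = JsonItemExporter(f)
exporter.start_exporting()                  # writes the opening "["
exporter.export_item({'name': 'example', 'price': '9.99'})   # dicts are accepted
exporter.finish_exporting()                 # writes the closing "]"
f.close()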
|
@ -1,238 +1,7 @@
|
||||
"""
|
||||
Feed Exports extension
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.feedexport` is deprecated, "
|
||||
"use `scrapy.extensions.feedexport` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/feed-exports.rst
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import posixpath
|
||||
from tempfile import TemporaryFile
|
||||
from datetime import datetime
|
||||
from six.moves.urllib.parse import urlparse
|
||||
from ftplib import FTP
|
||||
|
||||
from zope.interface import Interface, implementer
|
||||
from twisted.internet import defer, threads
|
||||
from w3lib.url import file_uri_to_path
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.utils.ftp import ftp_makedirs_cwd
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.python import get_func_args
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IFeedStorage(Interface):
|
||||
"""Interface that all Feed Storages must implement"""
|
||||
|
||||
def __init__(uri):
|
||||
"""Initialize the storage with the parameters given in the URI"""
|
||||
|
||||
def open(spider):
|
||||
"""Open the storage for the given spider. It must return a file-like
|
||||
object that will be used for the exporters"""
|
||||
|
||||
def store(file):
|
||||
"""Store the given file stream"""
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class BlockingFeedStorage(object):
|
||||
|
||||
def open(self, spider):
|
||||
return TemporaryFile(prefix='feed-')
|
||||
|
||||
def store(self, file):
|
||||
return threads.deferToThread(self._store_in_thread, file)
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class StdoutFeedStorage(object):
|
||||
|
||||
def __init__(self, uri, _stdout=sys.stdout):
|
||||
self._stdout = _stdout
|
||||
|
||||
def open(self, spider):
|
||||
return self._stdout
|
||||
|
||||
def store(self, file):
|
||||
pass
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class FileFeedStorage(object):
|
||||
|
||||
def __init__(self, uri):
|
||||
self.path = file_uri_to_path(uri)
|
||||
|
||||
def open(self, spider):
|
||||
dirname = os.path.dirname(self.path)
|
||||
if dirname and not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
return open(self.path, 'ab')
|
||||
|
||||
def store(self, file):
|
||||
file.close()
|
||||
|
||||
|
||||
class S3FeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri):
|
||||
from scrapy.conf import settings
|
||||
try:
|
||||
import boto
|
||||
except ImportError:
|
||||
raise NotConfigured
|
||||
self.connect_s3 = boto.connect_s3
|
||||
u = urlparse(uri)
|
||||
self.bucketname = u.hostname
|
||||
self.access_key = u.username or settings['AWS_ACCESS_KEY_ID']
|
||||
self.secret_key = u.password or settings['AWS_SECRET_ACCESS_KEY']
|
||||
self.keyname = u.path
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
conn = self.connect_s3(self.access_key, self.secret_key)
|
||||
bucket = conn.get_bucket(self.bucketname, validate=False)
|
||||
key = bucket.new_key(self.keyname)
|
||||
key.set_contents_from_file(file)
|
||||
key.close()
|
||||
|
||||
|
||||
class FTPFeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri):
|
||||
u = urlparse(uri)
|
||||
self.host = u.hostname
|
||||
self.port = int(u.port or '21')
|
||||
self.username = u.username
|
||||
self.password = u.password
|
||||
self.path = u.path
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
ftp = FTP()
|
||||
ftp.connect(self.host, self.port)
|
||||
ftp.login(self.username, self.password)
|
||||
dirname, filename = posixpath.split(self.path)
|
||||
ftp_makedirs_cwd(ftp, dirname)
|
||||
ftp.storbinary('STOR %s' % filename, file)
|
||||
ftp.quit()
|
||||
|
||||
|
||||
class SpiderSlot(object):
|
||||
def __init__(self, file, exporter, storage, uri):
|
||||
self.file = file
|
||||
self.exporter = exporter
|
||||
self.storage = storage
|
||||
self.uri = uri
|
||||
self.itemcount = 0
|
||||
|
||||
|
||||
class FeedExporter(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.settings = settings
|
||||
self.urifmt = settings['FEED_URI']
|
||||
if not self.urifmt:
|
||||
raise NotConfigured
|
||||
self.format = settings['FEED_FORMAT'].lower()
|
||||
self.storages = self._load_components('FEED_STORAGES')
|
||||
self.exporters = self._load_components('FEED_EXPORTERS')
|
||||
if not self._storage_supported(self.urifmt):
|
||||
raise NotConfigured
|
||||
if not self._exporter_supported(self.format):
|
||||
raise NotConfigured
|
||||
self.store_empty = settings.getbool('FEED_STORE_EMPTY')
|
||||
self.export_fields = settings.getlist('FEED_EXPORT_FIELDS')
|
||||
uripar = settings['FEED_URI_PARAMS']
|
||||
self._uripar = load_object(uripar) if uripar else lambda x, y: None
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings)
|
||||
crawler.signals.connect(o.open_spider, signals.spider_opened)
|
||||
crawler.signals.connect(o.close_spider, signals.spider_closed)
|
||||
crawler.signals.connect(o.item_scraped, signals.item_scraped)
|
||||
return o
|
||||
|
||||
def open_spider(self, spider):
|
||||
uri = self.urifmt % self._get_uri_params(spider)
|
||||
storage = self._get_storage(uri)
|
||||
file = storage.open(spider)
|
||||
exporter = self._get_exporter(file, fields_to_export=self.export_fields)
|
||||
exporter.start_exporting()
|
||||
self.slot = SpiderSlot(file, exporter, storage, uri)
|
||||
|
||||
def close_spider(self, spider):
|
||||
slot = self.slot
|
||||
if not slot.itemcount and not self.store_empty:
|
||||
return
|
||||
slot.exporter.finish_exporting()
|
||||
logfmt = "%%s %(format)s feed (%(itemcount)d items) in: %(uri)s"
|
||||
log_args = {'format': self.format,
|
||||
'itemcount': slot.itemcount,
|
||||
'uri': slot.uri}
|
||||
d = defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
|
||||
extra={'spider': spider}))
|
||||
d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
return d
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
slot = self.slot
|
||||
slot.exporter.export_item(item)
|
||||
slot.itemcount += 1
|
||||
return item
|
||||
|
||||
def _load_components(self, setting_prefix):
|
||||
conf = dict(self.settings['%s_BASE' % setting_prefix])
|
||||
conf.update(self.settings[setting_prefix])
|
||||
d = {}
|
||||
for k, v in conf.items():
|
||||
try:
|
||||
d[k] = load_object(v)
|
||||
except NotConfigured:
|
||||
pass
|
||||
return d
|
||||
|
||||
def _exporter_supported(self, format):
|
||||
if format in self.exporters:
|
||||
return True
|
||||
logger.error("Unknown feed format: %(format)s", {'format': format})
|
||||
|
||||
def _storage_supported(self, uri):
|
||||
scheme = urlparse(uri).scheme
|
||||
if scheme in self.storages:
|
||||
try:
|
||||
self._get_storage(uri)
|
||||
return True
|
||||
except NotConfigured:
|
||||
logger.error("Disabled feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
else:
|
||||
logger.error("Unknown feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
|
||||
def _get_exporter(self, *args, **kwargs):
|
||||
return self.exporters[self.format](*args, **kwargs)
|
||||
|
||||
def _get_storage(self, uri):
|
||||
return self.storages[urlparse(uri).scheme](uri)
|
||||
|
||||
def _get_uri_params(self, spider):
|
||||
params = {}
|
||||
for k in dir(spider):
|
||||
params[k] = getattr(spider, k)
|
||||
ts = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-')
|
||||
params['time'] = ts
|
||||
self._uripar(params, spider)
|
||||
return params
|
||||
from scrapy.extensions.feedexport import *
|
||||
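An illustrative feed configuration: %(name)s and %(time)s are filled in from the spider attributes and timestamp collected by _get_uri_params() above. The URI, credentials and field names are placeholders, and 'jsonlines' is assumed to be one of the registered FEED_EXPORTERS keys.

# settings.py
FEED_FORMAT = 'jsonlines'
FEED_URI = 'ftp://user:pass@ftp.example.com/feeds/%(name)s/%(time)s.jl'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = ['name', 'price']      # hypothetical item fields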
|
@ -1,378 +1,7 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import gzip
|
||||
from six.moves import cPickle as pickle
|
||||
from importlib import import_module
|
||||
from time import time
|
||||
from weakref import WeakKeyDictionary
|
||||
from email.utils import mktime_tz, parsedate_tz
|
||||
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
|
||||
from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.utils.project import data_path
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.httpcache` is deprecated, "
|
||||
"use `scrapy.extensions.httpcache` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
|
||||
class DummyPolicy(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
|
||||
self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]
|
||||
|
||||
def should_cache_request(self, request):
|
||||
return urlparse_cached(request).scheme not in self.ignore_schemes
|
||||
|
||||
def should_cache_response(self, response, request):
|
||||
return response.status not in self.ignore_http_codes
|
||||
|
||||
def is_cached_response_fresh(self, response, request):
|
||||
return True
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
return True
|
||||
|
||||
|
||||
class RFC2616Policy(object):
|
||||
|
||||
MAXAGE = 3600 * 24 * 365 # one year
|
||||
|
||||
def __init__(self, settings):
|
||||
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
|
||||
self._cc_parsed = WeakKeyDictionary()
|
||||
|
||||
def _parse_cachecontrol(self, r):
|
||||
if r not in self._cc_parsed:
|
||||
cch = r.headers.get('Cache-Control', '')
|
||||
self._cc_parsed[r] = parse_cachecontrol(cch)
|
||||
return self._cc_parsed[r]
|
||||
|
||||
def should_cache_request(self, request):
|
||||
if urlparse_cached(request).scheme in self.ignore_schemes:
|
||||
return False
|
||||
cc = self._parse_cachecontrol(request)
|
||||
# obey user-agent directive "Cache-Control: no-store"
|
||||
if 'no-store' in cc:
|
||||
return False
|
||||
# Any other is eligible for caching
|
||||
return True
|
||||
|
||||
def should_cache_response(self, response, request):
|
||||
# What is cacheable - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
|
||||
# Response cacheability - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
|
||||
# Status code 206 is not included because cache can not deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
# obey directive "Cache-Control: no-store"
|
||||
if 'no-store' in cc:
|
||||
return False
|
||||
# Never cache 304 (Not Modified) responses
|
||||
elif response.status == 304:
|
||||
return False
|
||||
# Any hint on response expiration is good
|
||||
elif 'max-age' in cc or 'Expires' in response.headers:
|
||||
return True
|
||||
# Firefox falls back these statuses to a one-year expiration if none is set
|
||||
elif response.status in (300, 301, 308):
|
||||
return True
|
||||
# Other statuses without expiration require at least one validator
|
||||
elif response.status in (200, 203, 401):
|
||||
return 'Last-Modified' in response.headers or 'ETag' in response.headers
|
||||
# Any other is probably not eligible for caching
|
||||
# Makes no sense to cache responses that do not contain expiration
|
||||
# info and can not be revalidated
|
||||
else:
|
||||
return False
|
||||
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
ccreq = self._parse_cachecontrol(request)
|
||||
if 'no-cache' in cc or 'no-cache' in ccreq:
|
||||
return False
|
||||
|
||||
now = time()
|
||||
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
|
||||
currentage = self._compute_current_age(cachedresponse, request, now)
|
||||
if currentage < freshnesslifetime:
|
||||
return True
|
||||
# Cached response is stale, try to set validators if any
|
||||
self._set_conditional_validators(request, cachedresponse)
|
||||
return False
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
return response.status == 304
|
||||
|
||||
def _set_conditional_validators(self, request, cachedresponse):
|
||||
if 'Last-Modified' in cachedresponse.headers:
|
||||
request.headers['If-Modified-Since'] = cachedresponse.headers['Last-Modified']
|
||||
|
||||
if 'ETag' in cachedresponse.headers:
|
||||
request.headers['If-None-Match'] = cachedresponse.headers['ETag']
|
||||
|
||||
def _compute_freshness_lifetime(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
|
||||
# http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410
|
||||
cc = self._parse_cachecontrol(response)
|
||||
if 'max-age' in cc:
|
||||
try:
|
||||
return max(0, int(cc['max-age']))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Parse date header or synthesize it if none exists
|
||||
date = rfc1123_to_epoch(response.headers.get('Date')) or now
|
||||
|
||||
# Try HTTP/1.0 Expires header
|
||||
if 'Expires' in response.headers:
|
||||
expires = rfc1123_to_epoch(response.headers['Expires'])
|
||||
# When parsing Expires header fails RFC 2616 section 14.21 says we
|
||||
# should treat this as an expiration time in the past.
|
||||
return max(0, expires - date) if expires else 0
|
||||
|
||||
# Fallback to heuristic using last-modified header
|
||||
# This is not in RFC but on Firefox caching implementation
|
||||
lastmodified = rfc1123_to_epoch(response.headers.get('Last-Modified'))
|
||||
if lastmodified and lastmodified <= date:
|
||||
return (date - lastmodified) / 10
|
||||
|
||||
# This request can be cached indefinitely
|
||||
if response.status in (300, 301, 308):
|
||||
return self.MAXAGE
|
||||
|
||||
# Insufficient information to compute freshness lifetime
|
||||
return 0
|
||||
|
||||
def _compute_current_age(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeCurrentAge
|
||||
# http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#366
|
||||
currentage = 0
|
||||
# If Date header is not set we assume it is a fast connection, and
|
||||
# clock is in sync with the server
|
||||
date = rfc1123_to_epoch(response.headers.get('Date')) or now
|
||||
if now > date:
|
||||
currentage = now - date
|
||||
|
||||
if 'Age' in response.headers:
|
||||
try:
|
||||
age = int(response.headers['Age'])
|
||||
currentage = max(currentage, age)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return currentage
|
||||
|
||||
|
||||
class DbmCacheStorage(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
|
||||
self.db = None
|
||||
|
||||
def open_spider(self, spider):
|
||||
dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
|
||||
self.db = self.dbmodule.open(dbpath, 'c')
|
||||
|
||||
def close_spider(self, spider):
|
||||
self.db.close()
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return # not cached
|
||||
url = data['url']
|
||||
status = data['status']
|
||||
headers = Headers(data['headers'])
|
||||
body = data['body']
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
key = self._request_key(request)
|
||||
data = {
|
||||
'status': response.status,
|
||||
'url': response.url,
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
}
|
||||
self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
|
||||
self.db['%s_time' % key] = str(time())
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
key = self._request_key(request)
|
||||
db = self.db
|
||||
tkey = '%s_time' % key
|
||||
if tkey not in db:
|
||||
return # not found
|
||||
|
||||
ts = db[tkey]
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return # expired
|
||||
|
||||
return pickle.loads(db['%s_data' % key])
|
||||
|
||||
def _request_key(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
|
||||
class FilesystemCacheStorage(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
|
||||
self._open = gzip.open if self.use_gzip else open
|
||||
|
||||
def open_spider(self, spider):
|
||||
pass
|
||||
|
||||
def close_spider(self, spider):
|
||||
pass
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
"""Return response if present in cache, or None otherwise."""
|
||||
metadata = self._read_meta(spider, request)
|
||||
if metadata is None:
|
||||
return # not cached
|
||||
rpath = self._get_request_path(spider, request)
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
|
||||
body = f.read()
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata.get('response_url')
|
||||
status = metadata['status']
|
||||
headers = Headers(headers_raw_to_dict(rawheaders))
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
"""Store the given response in the cache."""
|
||||
rpath = self._get_request_path(spider, request)
|
||||
if not os.path.exists(rpath):
|
||||
os.makedirs(rpath)
|
||||
metadata = {
|
||||
'url': request.url,
|
||||
'method': request.method,
|
||||
'status': response.status,
|
||||
'response_url': response.url,
|
||||
'timestamp': time(),
|
||||
}
|
||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(repr(metadata))
|
||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=2)
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
f.write(response.body)
|
||||
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
key = request_fingerprint(request)
|
||||
return os.path.join(self.cachedir, spider.name, key[0:2], key)
|
||||
|
||||
def _read_meta(self, spider, request):
|
||||
rpath = self._get_request_path(spider, request)
|
||||
metapath = os.path.join(rpath, 'pickled_meta')
|
||||
if not os.path.exists(metapath):
|
||||
return # not found
|
||||
mtime = os.stat(rpath).st_mtime
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with self._open(metapath, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
class LeveldbCacheStorage(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
import leveldb
|
||||
self._leveldb = leveldb
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.db = None
|
||||
|
||||
def open_spider(self, spider):
|
||||
dbpath = os.path.join(self.cachedir, '%s.leveldb' % spider.name)
|
||||
self.db = self._leveldb.LevelDB(dbpath)
|
||||
|
||||
def close_spider(self, spider):
|
||||
del self.db
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return # not cached
|
||||
url = data['url']
|
||||
status = data['status']
|
||||
headers = Headers(data['headers'])
|
||||
body = data['body']
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
key = self._request_key(request)
|
||||
data = {
|
||||
'status': response.status,
|
||||
'url': response.url,
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
}
|
||||
batch = self._leveldb.WriteBatch()
|
||||
batch.Put('%s_data' % key, pickle.dumps(data, protocol=2))
|
||||
batch.Put('%s_time' % key, str(time()))
|
||||
self.db.Write(batch)
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
key = self._request_key(request)
|
||||
try:
|
||||
ts = self.db.Get('%s_time' % key)
|
||||
except KeyError:
|
||||
return # not found or invalid entry
|
||||
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return # expired
|
||||
|
||||
try:
|
||||
data = self.db.Get('%s_data' % key)
|
||||
except KeyError:
|
||||
return # invalid entry
|
||||
else:
|
||||
return pickle.loads(data)
|
||||
|
||||
def _request_key(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
|
||||
|
||||
def parse_cachecontrol(header):
|
||||
"""Parse Cache-Control header
|
||||
|
||||
http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
|
||||
|
||||
>>> parse_cachecontrol('public, max-age=3600') == {'public': None,
|
||||
... 'max-age': '3600'}
|
||||
True
|
||||
>>> parse_cachecontrol('') == {}
|
||||
True
|
||||
|
||||
"""
|
||||
directives = {}
|
||||
for directive in header.split(','):
|
||||
key, sep, val = directive.strip().partition('=')
|
||||
if key:
|
||||
directives[key.lower()] = val if sep else None
|
||||
return directives
|
||||
|
||||
|
||||
def rfc1123_to_epoch(date_str):
|
||||
try:
|
||||
return mktime_tz(parsedate_tz(date_str))
|
||||
except Exception:
|
||||
return None
|
||||
from scrapy.extensions.httpcache import *
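The policies and storages above keep their behaviour under the new scrapy.extensions.httpcache path; only the import location changes. A hedged configuration sketch, assuming the standard HTTPCACHE_* settings read by the cache middleware and the storages shown in this hunk; the values are illustrative:

# settings.py (sketch)
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0   # 0 means cached entries never expire
HTTPCACHE_GZIP = True           # gzip-compress cached bodies on disk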
|
||||
|
@@ -1,8 +1,7 @@
"""
scrapy.contrib.linkextractors

This package contains a collection of Link Extractors.

For more info see docs/topics/link-extractors.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.linkextractors` is deprecated, "
"use `scrapy.linkextractors` instead",
ScrapyDeprecationWarning, stacklevel=2)

from .lxmlhtml import LxmlLinkExtractor as LinkExtractor
from scrapy.linkextractors import *
|
||||
|
@@ -1,75 +1,7 @@
|
||||
"""
|
||||
HTMLParser-based link extractor
|
||||
"""
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.linkextractors.htmlparser` is deprecated, "
|
||||
"use `scrapy.linkextractors.htmlparser` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
from six.moves.urllib.parse import urljoin
|
||||
|
||||
from w3lib.url import safe_url_string
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
|
||||
class HtmlParserLinkExtractor(HTMLParser):
|
||||
|
||||
def __init__(self, tag="a", attr="href", process=None, unique=False):
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||
self.process_attr = process if callable(process) else lambda v: v
|
||||
self.unique = unique
|
||||
|
||||
def _extract_links(self, response_text, response_url, response_encoding):
|
||||
self.reset()
|
||||
self.feed(response_text)
|
||||
self.close()
|
||||
|
||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
||||
|
||||
ret = []
|
||||
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in links:
|
||||
if isinstance(link.url, unicode):
|
||||
link.url = link.url.encode(response_encoding)
|
||||
link.url = urljoin(base_url, link.url)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
link.text = link.text.decode(response_encoding)
|
||||
ret.append(link)
|
||||
|
||||
return ret
|
||||
|
||||
def extract_links(self, response):
|
||||
# wrapper needed to allow to work directly with text
|
||||
return self._extract_links(response.body, response.url, response.encoding)
|
||||
|
||||
def reset(self):
|
||||
HTMLParser.reset(self)
|
||||
|
||||
self.base_url = None
|
||||
self.current_link = None
|
||||
self.links = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag == 'base':
|
||||
self.base_url = dict(attrs).get('href')
|
||||
if self.scan_tag(tag):
|
||||
for attr, value in attrs:
|
||||
if self.scan_attr(attr):
|
||||
url = self.process_attr(value)
|
||||
link = Link(url=url)
|
||||
self.links.append(link)
|
||||
self.current_link = link
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self.scan_tag(tag):
|
||||
self.current_link = None
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.current_link:
|
||||
self.current_link.text = self.current_link.text + data
|
||||
|
||||
def matches(self, url):
|
||||
"""This extractor matches with any url, since
|
||||
it doesn't contain any patterns"""
|
||||
return True
|
||||
from scrapy.linkextractors.htmlparser import *
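Each relocated module now reduces to the pattern above: emit a ScrapyDeprecationWarning at import time, then re-export the new module with a star import. A minimal sketch of what a user of the old path would observe in a fresh interpreter (module caching means the warning fires only on the first import):

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    import scrapy.contrib.linkextractors.htmlparser  # old path, now a shim

# caught holds one ScrapyDeprecationWarning pointing at
# scrapy.linkextractors.htmlparser
print([str(w.message) for w in caught])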
|
||||
|
@@ -1,111 +1,7 @@
|
||||
"""
|
||||
Link extractor based on lxml.html
|
||||
"""
|
||||
|
||||
import re
|
||||
from six.moves.urllib.parse import urlparse, urljoin
|
||||
|
||||
import lxml.etree as etree
|
||||
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.python import unique as unique_list, str_to_unicode
|
||||
from scrapy.linkextractor import FilteringLinkExtractor
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
|
||||
# from lxml/src/lxml/html/__init__.py
|
||||
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
|
||||
|
||||
_collect_string_content = etree.XPath("string()")
|
||||
|
||||
def _nons(tag):
|
||||
if isinstance(tag, basestring):
|
||||
if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
|
||||
return tag.split('}')[-1]
|
||||
return tag
|
||||
|
||||
|
||||
class LxmlParserLinkExtractor(object):
|
||||
def __init__(self, tag="a", attr="href", process=None, unique=False):
|
||||
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||
self.process_attr = process if callable(process) else lambda v: v
|
||||
self.unique = unique
|
||||
|
||||
def _iter_links(self, document):
|
||||
for el in document.iter(etree.Element):
|
||||
if not self.scan_tag(_nons(el.tag)):
|
||||
continue
|
||||
attribs = el.attrib
|
||||
for attrib in attribs:
|
||||
if not self.scan_attr(attrib):
|
||||
continue
|
||||
yield (el, attrib, attribs[attrib])
|
||||
|
||||
def _extract_links(self, selector, response_url, response_encoding, base_url):
|
||||
links = []
|
||||
# hacky way to get the underlying lxml parsed document
|
||||
for el, attr, attr_val in self._iter_links(selector._root):
|
||||
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
|
||||
attr_val = urljoin(base_url, attr_val)
|
||||
url = self.process_attr(attr_val)
|
||||
if url is None:
|
||||
continue
|
||||
if isinstance(url, unicode):
|
||||
url = url.encode(response_encoding)
|
||||
# to fix relative links after process_value
|
||||
url = urljoin(response_url, url)
|
||||
link = Link(url, _collect_string_content(el) or u'',
|
||||
nofollow=True if el.get('rel') == 'nofollow' else False)
|
||||
links.append(link)
|
||||
|
||||
return unique_list(links, key=lambda link: link.url) \
|
||||
if self.unique else links
|
||||
|
||||
def extract_links(self, response):
|
||||
html = Selector(response)
|
||||
base_url = get_base_url(response)
|
||||
return self._extract_links(html, response.url, response.encoding, base_url)
|
||||
|
||||
def _process_links(self, links):
|
||||
""" Normalize and filter extracted links
|
||||
|
||||
The subclass should override it if necessary
|
||||
"""
|
||||
links = unique_list(links, key=lambda link: link.url) if self.unique else links
|
||||
return links
|
||||
|
||||
|
||||
class LxmlLinkExtractor(FilteringLinkExtractor):
|
||||
|
||||
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
|
||||
tags=('a', 'area'), attrs=('href',), canonicalize=True,
|
||||
unique=True, process_value=None, deny_extensions=None, restrict_css=()):
|
||||
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
|
||||
tag_func = lambda x: x in tags
|
||||
attr_func = lambda x: x in attrs
|
||||
lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
|
||||
unique=unique, process=process_value)
|
||||
|
||||
super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
|
||||
allow_domains=allow_domains, deny_domains=deny_domains,
|
||||
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
|
||||
canonicalize=canonicalize, deny_extensions=deny_extensions)
|
||||
|
||||
def extract_links(self, response):
|
||||
html = Selector(response)
|
||||
base_url = get_base_url(response)
|
||||
if self.restrict_xpaths:
|
||||
docs = [subdoc
|
||||
for x in self.restrict_xpaths
|
||||
for subdoc in html.xpath(x)]
|
||||
else:
|
||||
docs = [html]
|
||||
all_links = []
|
||||
for doc in docs:
|
||||
links = self._extract_links(doc, response.url, response.encoding, base_url)
|
||||
all_links.extend(self._process_links(links))
|
||||
return unique_list(all_links)
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.linkextractors.lxmlhtml` is deprecated, "
|
||||
"use `scrapy.linkextractors.lxmlhtml` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.linkextractors.lxmlhtml import *
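LxmlLinkExtractor is also exposed as LinkExtractor from the new scrapy.linkextractors package (see the __init__ hunk above). A short usage sketch; the allow pattern and the XPath restriction are illustrative:

from scrapy.linkextractors import LinkExtractor

extractor = LinkExtractor(allow=(r'/item/\d+',),
                          restrict_xpaths=('//div[@id="content"]',))

# inside a spider callback:
#     for link in extractor.extract_links(response):
#         # each Link carries .url, .text and .nofollow
#         yield Request(link.url, callback=self.parse_item)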
|
||||
|
@@ -1,30 +1,7 @@
|
||||
import re
|
||||
from six.moves.urllib.parse import urljoin
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.linkextractors.regex` is deprecated, "
|
||||
"use `scrapy.linkextractors.regex` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from w3lib.html import remove_tags, replace_entities, replace_escape_chars
|
||||
|
||||
from scrapy.link import Link
|
||||
from .sgml import SgmlLinkExtractor
|
||||
|
||||
linkre = re.compile(
|
||||
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
|
||||
re.DOTALL | re.IGNORECASE)
|
||||
|
||||
def clean_link(link_text):
|
||||
"""Remove leading and trailing whitespace and punctuation"""
|
||||
return link_text.strip("\t\r\n '\"")
|
||||
|
||||
class RegexLinkExtractor(SgmlLinkExtractor):
|
||||
"""High performant link extractor"""
|
||||
|
||||
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
|
||||
if base_url is None:
|
||||
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
|
||||
|
||||
clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
|
||||
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
|
||||
|
||||
links_text = linkre.findall(response_text)
|
||||
return [Link(clean_url(url).encode(response_encoding),
|
||||
clean_text(text))
|
||||
for url, _, text in links_text]
|
||||
from scrapy.linkextractors.regex import *
|
||||
|
@@ -1,140 +1,7 @@
|
||||
"""
|
||||
SGMLParser-based Link extractors
|
||||
"""
|
||||
from six.moves.urllib.parse import urljoin
|
||||
import warnings
|
||||
from sgmllib import SGMLParser
|
||||
|
||||
from w3lib.url import safe_url_string
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.link import Link
|
||||
from scrapy.linkextractor import FilteringLinkExtractor
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.python import unique as unique_list, str_to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.linkextractors.sgml` is deprecated, "
|
||||
"use `scrapy.linkextractors.sgml` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
|
||||
class BaseSgmlLinkExtractor(SGMLParser):
|
||||
|
||||
def __init__(self, tag="a", attr="href", unique=False, process_value=None):
|
||||
warnings.warn(
|
||||
"BaseSgmlLinkExtractor is deprecated and will be removed in future releases. "
|
||||
"Please use scrapy.contrib.linkextractors.LinkExtractor",
|
||||
ScrapyDeprecationWarning
|
||||
)
|
||||
SGMLParser.__init__(self)
|
||||
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||
self.process_value = (lambda v: v) if process_value is None else process_value
|
||||
self.current_link = None
|
||||
self.unique = unique
|
||||
|
||||
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
|
||||
""" Do the real extraction work """
|
||||
self.reset()
|
||||
self.feed(response_text)
|
||||
self.close()
|
||||
|
||||
ret = []
|
||||
if base_url is None:
|
||||
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in self.links:
|
||||
if isinstance(link.url, unicode):
|
||||
link.url = link.url.encode(response_encoding)
|
||||
link.url = urljoin(base_url, link.url)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
link.text = str_to_unicode(link.text, response_encoding, errors='replace').strip()
|
||||
ret.append(link)
|
||||
|
||||
return ret
|
||||
|
||||
def _process_links(self, links):
|
||||
""" Normalize and filter extracted links
|
||||
|
||||
The subclass should override it if necessary
|
||||
"""
|
||||
links = unique_list(links, key=lambda link: link.url) if self.unique else links
|
||||
return links
|
||||
|
||||
def extract_links(self, response):
|
||||
# wrapper needed to allow to work directly with text
|
||||
links = self._extract_links(response.body, response.url, response.encoding)
|
||||
links = self._process_links(links)
|
||||
return links
|
||||
|
||||
def reset(self):
|
||||
SGMLParser.reset(self)
|
||||
self.links = []
|
||||
self.base_url = None
|
||||
self.current_link = None
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
if tag == 'base':
|
||||
self.base_url = dict(attrs).get('href')
|
||||
if self.scan_tag(tag):
|
||||
for attr, value in attrs:
|
||||
if self.scan_attr(attr):
|
||||
url = self.process_value(value)
|
||||
if url is not None:
|
||||
link = Link(url=url, nofollow=True if dict(attrs).get('rel') == 'nofollow' else False)
|
||||
self.links.append(link)
|
||||
self.current_link = link
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
if self.scan_tag(tag):
|
||||
self.current_link = None
|
||||
|
||||
def handle_data(self, data):
|
||||
if self.current_link:
|
||||
self.current_link.text = self.current_link.text + data
|
||||
|
||||
def matches(self, url):
|
||||
"""This extractor matches with any url, since
|
||||
it doesn't contain any patterns"""
|
||||
return True
|
||||
|
||||
|
||||
class SgmlLinkExtractor(FilteringLinkExtractor):
|
||||
|
||||
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
|
||||
tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
|
||||
process_value=None, deny_extensions=None, restrict_css=()):
|
||||
|
||||
warnings.warn(
|
||||
"SgmlLinkExtractor is deprecated and will be removed in future releases. "
|
||||
"Please use scrapy.contrib.linkextractors.LinkExtractor",
|
||||
ScrapyDeprecationWarning
|
||||
)
|
||||
|
||||
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
|
||||
tag_func = lambda x: x in tags
|
||||
attr_func = lambda x: x in attrs
|
||||
|
||||
with warnings.catch_warnings(record=True):
|
||||
lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
|
||||
unique=unique, process_value=process_value)
|
||||
|
||||
super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
|
||||
allow_domains=allow_domains, deny_domains=deny_domains,
|
||||
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
|
||||
canonicalize=canonicalize, deny_extensions=deny_extensions)
|
||||
|
||||
# FIXME: was added to fix a RegexLinkExtractor testcase
|
||||
self.base_url = None
|
||||
|
||||
def extract_links(self, response):
|
||||
base_url = None
|
||||
if self.restrict_xpaths:
|
||||
sel = Selector(response)
|
||||
base_url = get_base_url(response)
|
||||
body = u''.join(f
|
||||
for x in self.restrict_xpaths
|
||||
for f in sel.xpath(x).extract()
|
||||
).encode(response.encoding, errors='xmlcharrefreplace')
|
||||
else:
|
||||
body = response.body
|
||||
|
||||
links = self._extract_links(body, response.url, response.encoding, base_url)
|
||||
links = self._process_links(links)
|
||||
return links
|
||||
from scrapy.linkextractors.sgml import *
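Note how SgmlLinkExtractor.__init__ above builds the deprecated BaseSgmlLinkExtractor inside warnings.catch_warnings(record=True), so the inner warning never reaches a caller who has already been warned once. A standalone sketch of that suppression pattern, using a made-up legacy class:

import warnings

class _LegacyHelper(object):
    def __init__(self):
        warnings.warn('use the new helper instead', DeprecationWarning)

with warnings.catch_warnings(record=True):
    helper = _LegacyHelper()   # the warning is recorded locally and dropped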
|
||||
|
@@ -1,172 +1,7 @@
"""Item Loader

See documentation in docs/topics/loaders.rst

"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.loader` is deprecated, "
"use `scrapy.loader` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
from collections import defaultdict
|
||||
import six
|
||||
|
||||
from scrapy.item import Item
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.utils.decorator import deprecated
|
||||
from scrapy.utils.deprecate import create_deprecated_class
|
||||
from scrapy.utils.misc import arg_to_iter, extract_regex
|
||||
from scrapy.utils.python import flatten
|
||||
|
||||
from .common import wrap_loader_context
|
||||
from .processor import Identity
|
||||
|
||||
|
||||
class ItemLoader(object):
|
||||
|
||||
default_item_class = Item
|
||||
default_input_processor = Identity()
|
||||
default_output_processor = Identity()
|
||||
default_selector_class = Selector
|
||||
|
||||
def __init__(self, item=None, selector=None, response=None, **context):
|
||||
if selector is None and response is not None:
|
||||
selector = self.default_selector_class(response)
|
||||
self.selector = selector
|
||||
context.update(selector=selector, response=response)
|
||||
if item is None:
|
||||
item = self.default_item_class()
|
||||
self.item = context['item'] = item
|
||||
self.context = context
|
||||
self._values = defaultdict(list)
|
||||
|
||||
def add_value(self, field_name, value, *processors, **kw):
|
||||
value = self.get_value(value, *processors, **kw)
|
||||
if value is None:
|
||||
return
|
||||
if not field_name:
|
||||
for k, v in six.iteritems(value):
|
||||
self._add_value(k, v)
|
||||
else:
|
||||
self._add_value(field_name, value)
|
||||
|
||||
def replace_value(self, field_name, value, *processors, **kw):
|
||||
value = self.get_value(value, *processors, **kw)
|
||||
if value is None:
|
||||
return
|
||||
if not field_name:
|
||||
for k, v in six.iteritems(value):
|
||||
self._replace_value(k, v)
|
||||
else:
|
||||
self._replace_value(field_name, value)
|
||||
|
||||
def _add_value(self, field_name, value):
|
||||
value = arg_to_iter(value)
|
||||
processed_value = self._process_input_value(field_name, value)
|
||||
if processed_value:
|
||||
self._values[field_name] += arg_to_iter(processed_value)
|
||||
|
||||
def _replace_value(self, field_name, value):
|
||||
self._values.pop(field_name, None)
|
||||
self._add_value(field_name, value)
|
||||
|
||||
def get_value(self, value, *processors, **kw):
|
||||
regex = kw.get('re', None)
|
||||
if regex:
|
||||
value = arg_to_iter(value)
|
||||
value = flatten([extract_regex(regex, x) for x in value])
|
||||
|
||||
for proc in processors:
|
||||
if value is None:
|
||||
break
|
||||
proc = wrap_loader_context(proc, self.context)
|
||||
value = proc(value)
|
||||
return value
|
||||
|
||||
def load_item(self):
|
||||
item = self.item
|
||||
for field_name in tuple(self._values):
|
||||
value = self.get_output_value(field_name)
|
||||
if value is not None:
|
||||
item[field_name] = value
|
||||
return item
|
||||
|
||||
def get_output_value(self, field_name):
|
||||
proc = self.get_output_processor(field_name)
|
||||
proc = wrap_loader_context(proc, self.context)
|
||||
try:
|
||||
return proc(self._values[field_name])
|
||||
except Exception as e:
|
||||
raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" % \
|
||||
(field_name, self._values[field_name], type(e).__name__, str(e)))
|
||||
|
||||
def get_collected_values(self, field_name):
|
||||
return self._values[field_name]
|
||||
|
||||
def get_input_processor(self, field_name):
|
||||
proc = getattr(self, '%s_in' % field_name, None)
|
||||
if not proc:
|
||||
proc = self._get_item_field_attr(field_name, 'input_processor', \
|
||||
self.default_input_processor)
|
||||
return proc
|
||||
|
||||
def get_output_processor(self, field_name):
|
||||
proc = getattr(self, '%s_out' % field_name, None)
|
||||
if not proc:
|
||||
proc = self._get_item_field_attr(field_name, 'output_processor', \
|
||||
self.default_output_processor)
|
||||
return proc
|
||||
|
||||
def _process_input_value(self, field_name, value):
|
||||
proc = self.get_input_processor(field_name)
|
||||
proc = wrap_loader_context(proc, self.context)
|
||||
return proc(value)
|
||||
|
||||
def _get_item_field_attr(self, field_name, key, default=None):
|
||||
if isinstance(self.item, Item):
|
||||
value = self.item.fields[field_name].get(key, default)
|
||||
else:
|
||||
value = default
|
||||
return value
|
||||
|
||||
def _check_selector_method(self):
|
||||
if self.selector is None:
|
||||
raise RuntimeError("To use XPath or CSS selectors, "
|
||||
"%s must be instantiated with a selector "
|
||||
"or a response" % self.__class__.__name__)
|
||||
|
||||
def add_xpath(self, field_name, xpath, *processors, **kw):
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
self.add_value(field_name, values, *processors, **kw)
|
||||
|
||||
def replace_xpath(self, field_name, xpath, *processors, **kw):
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
self.replace_value(field_name, values, *processors, **kw)
|
||||
|
||||
def get_xpath(self, xpath, *processors, **kw):
|
||||
values = self._get_xpathvalues(xpath, **kw)
|
||||
return self.get_value(values, *processors, **kw)
|
||||
|
||||
@deprecated(use_instead='._get_xpathvalues()')
|
||||
def _get_values(self, xpaths, **kw):
|
||||
return self._get_xpathvalues(xpaths, **kw)
|
||||
|
||||
def _get_xpathvalues(self, xpaths, **kw):
|
||||
self._check_selector_method()
|
||||
xpaths = arg_to_iter(xpaths)
|
||||
return flatten([self.selector.xpath(xpath).extract() for xpath in xpaths])
|
||||
|
||||
def add_css(self, field_name, css, *processors, **kw):
|
||||
values = self._get_cssvalues(css, **kw)
|
||||
self.add_value(field_name, values, *processors, **kw)
|
||||
|
||||
def replace_css(self, field_name, css, *processors, **kw):
|
||||
values = self._get_cssvalues(css, **kw)
|
||||
self.replace_value(field_name, values, *processors, **kw)
|
||||
|
||||
def get_css(self, css, *processors, **kw):
|
||||
values = self._get_cssvalues(css, **kw)
|
||||
return self.get_value(values, *processors, **kw)
|
||||
|
||||
def _get_cssvalues(self, csss, **kw):
|
||||
self._check_selector_method()
|
||||
csss = arg_to_iter(csss)
|
||||
return flatten([self.selector.css(css).extract() for css in csss])
|
||||
|
||||
|
||||
XPathItemLoader = create_deprecated_class('XPathItemLoader', ItemLoader)
|
||||
from scrapy.loader import *
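A short usage sketch for the relocated ItemLoader; the Product item and the XPath expressions are hypothetical, while add_xpath/add_value/load_item and the `re` keyword are the methods defined above:

from scrapy.item import Item, Field
from scrapy.loader import ItemLoader

class Product(Item):
    name = Field()
    price = Field()
    url = Field()

def parse_product(response):   # a spider callback
    loader = ItemLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath('price', '//p[@class="price"]/text()', re=r'[\d.,]+')
    loader.add_value('url', response.url)
    return loader.load_item()  # output processors run here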
|
||||
|
@@ -1,13 +1,7 @@
|
||||
"""Common functions used in Item Loaders code"""
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.loader.common` is deprecated, "
|
||||
"use `scrapy.loader.common` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from functools import partial
|
||||
from scrapy.utils.python import get_func_args
|
||||
|
||||
def wrap_loader_context(function, context):
|
||||
"""Wrap functions that receive loader_context to contain the context
|
||||
"pre-loaded" and expose a interface that receives only one argument
|
||||
"""
|
||||
if 'loader_context' in get_func_args(function):
|
||||
return partial(function, loader_context=context)
|
||||
else:
|
||||
return function
|
||||
from scrapy.loader.common import *
|
||||
|
@@ -1,93 +1,7 @@
"""
This module provides some commonly used processors for Item Loaders.

See documentation in docs/topics/loaders.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.loader.processor` is deprecated, "
"use `scrapy.loader.processors` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.datatypes import MergeDict
|
||||
from .common import wrap_loader_context
|
||||
|
||||
|
||||
class MapCompose(object):
|
||||
|
||||
def __init__(self, *functions, **default_loader_context):
|
||||
self.functions = functions
|
||||
self.default_loader_context = default_loader_context
|
||||
|
||||
def __call__(self, value, loader_context=None):
|
||||
values = arg_to_iter(value)
|
||||
if loader_context:
|
||||
context = MergeDict(loader_context, self.default_loader_context)
|
||||
else:
|
||||
context = self.default_loader_context
|
||||
wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
|
||||
for func in wrapped_funcs:
|
||||
next_values = []
|
||||
for v in values:
|
||||
next_values += arg_to_iter(func(v))
|
||||
values = next_values
|
||||
return values
|
||||
|
||||
|
||||
class Compose(object):
|
||||
|
||||
def __init__(self, *functions, **default_loader_context):
|
||||
self.functions = functions
|
||||
self.stop_on_none = default_loader_context.get('stop_on_none', True)
|
||||
self.default_loader_context = default_loader_context
|
||||
|
||||
def __call__(self, value, loader_context=None):
|
||||
if loader_context:
|
||||
context = MergeDict(loader_context, self.default_loader_context)
|
||||
else:
|
||||
context = self.default_loader_context
|
||||
wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
|
||||
for func in wrapped_funcs:
|
||||
if value is None and self.stop_on_none:
|
||||
break
|
||||
value = func(value)
|
||||
return value
|
||||
|
||||
|
||||
class TakeFirst(object):
|
||||
|
||||
def __call__(self, values):
|
||||
for value in values:
|
||||
if value is not None and value != '':
|
||||
return value
|
||||
|
||||
|
||||
class Identity(object):
|
||||
|
||||
def __call__(self, values):
|
||||
return values
|
||||
|
||||
|
||||
class SelectJmes(object):
|
||||
"""
|
||||
Query the input string for the jmespath (given at instantiation),
|
||||
and return the answer
|
||||
Requires : jmespath(https://github.com/jmespath/jmespath)
|
||||
Note: SelectJmes accepts only one input element at a time.
|
||||
"""
|
||||
def __init__(self, json_path):
|
||||
self.json_path = json_path
|
||||
import jmespath
|
||||
self.compiled_path = jmespath.compile(self.json_path)
|
||||
|
||||
def __call__(self, value):
|
||||
"""Query value for the jmespath query and return answer
|
||||
:param str value: a string with JSON data to extract from
|
||||
:return: Element extracted according to jmespath query
|
||||
"""
|
||||
return self.compiled_path.search(value)
|
||||
|
||||
|
||||
class Join(object):
|
||||
|
||||
def __init__(self, separator=u' '):
|
||||
self.separator = separator
|
||||
|
||||
def __call__(self, values):
|
||||
return self.separator.join(values)
|
||||
from scrapy.loader.processors import *
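The behaviour of the processors above is easiest to see on plain lists. A minimal sketch (the values and lambdas are illustrative); the expected results follow from the __call__ implementations shown in this hunk:

from scrapy.loader.processors import Compose, Join, MapCompose, TakeFirst

MapCompose(lambda v: v.strip(), lambda v: v.title())([u' hello ', u' world '])
# -> [u'Hello', u'World']
TakeFirst()([None, u'', u'first', u'second'])   # -> u'first' (skips None and '')
Join(u', ')([u'a', u'b', u'c'])                 # -> u'a, b, c'
Compose(lambda v: v[0], lambda v: v.upper())([u'x', u'y'])   # -> u'X'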
|
||||
|
@@ -1,51 +1,7 @@
|
||||
import logging
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.logstats` is deprecated, "
|
||||
"use `scrapy.extensions.logstats` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy import signals
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LogStats(object):
|
||||
"""Log basic scraping stats periodically"""
|
||||
|
||||
def __init__(self, stats, interval=60.0):
|
||||
self.stats = stats
|
||||
self.interval = interval
|
||||
self.multiplier = 60.0 / self.interval
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
|
||||
if not interval:
|
||||
raise NotConfigured
|
||||
o = cls(crawler.stats, interval)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.pagesprev = 0
|
||||
self.itemsprev = 0
|
||||
|
||||
self.task = task.LoopingCall(self.log, spider)
|
||||
self.task.start(self.interval)
|
||||
|
||||
def log(self, spider):
|
||||
items = self.stats.get_value('item_scraped_count', 0)
|
||||
pages = self.stats.get_value('response_received_count', 0)
|
||||
irate = (items - self.itemsprev) * self.multiplier
|
||||
prate = (pages - self.pagesprev) * self.multiplier
|
||||
self.pagesprev, self.itemsprev = pages, items
|
||||
|
||||
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
|
||||
"scraped %(items)d items (at %(itemrate)d items/min)")
|
||||
log_args = {'pages': pages, 'pagerate': prate,
|
||||
'items': items, 'itemrate': irate}
|
||||
logger.info(msg, log_args, extra={'spider': spider})
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
if self.task.running:
|
||||
self.task.stop()
|
||||
from scrapy.extensions.logstats import *
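The multiplier in LogStats converts per-interval deltas into per-minute rates. A small arithmetic sketch with an illustrative LOGSTATS_INTERVAL of 15 seconds:

interval = 15.0                    # LOGSTATS_INTERVAL (seconds)
multiplier = 60.0 / interval       # 4 intervals per minute

pages_prev, pages_now = 120, 150
page_rate = (pages_now - pages_prev) * multiplier
# 30 pages crawled in the last 15 seconds -> 120.0 pages/min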
|
||||
|
@@ -1,34 +1,7 @@
"""
MemoryDebugger extension

See documentation in docs/topics/extensions.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.memdebug` is deprecated, "
"use `scrapy.extensions.memdebug` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
import gc
|
||||
import six
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.trackref import live_refs
|
||||
|
||||
|
||||
class MemoryDebugger(object):
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
|
||||
raise NotConfigured
|
||||
o = cls(crawler.stats)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
gc.collect()
|
||||
self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
|
||||
for cls, wdict in six.iteritems(live_refs):
|
||||
if not wdict:
|
||||
continue
|
||||
self.stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict), spider=spider)
|
||||
from scrapy.extensions.memdebug import *
|
||||
|
@@ -1,122 +1,7 @@
"""
MemoryUsage extension

See documentation in docs/topics/extensions.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.memusage` is deprecated, "
"use `scrapy.extensions.memusage` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
import sys
|
||||
import socket
|
||||
import logging
|
||||
from pprint import pformat
|
||||
from importlib import import_module
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.mail import MailSender
|
||||
from scrapy.utils.engine import get_engine_status
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MemoryUsage(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
|
||||
raise NotConfigured
|
||||
try:
|
||||
# stdlib's resource module is only available on unix platforms.
|
||||
self.resource = import_module('resource')
|
||||
except ImportError:
|
||||
raise NotConfigured
|
||||
|
||||
self.crawler = crawler
|
||||
self.warned = False
|
||||
self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
|
||||
self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
|
||||
self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
|
||||
self.report = crawler.settings.getbool('MEMUSAGE_REPORT')
|
||||
self.mail = MailSender.from_settings(crawler.settings)
|
||||
crawler.signals.connect(self.engine_started, signal=signals.engine_started)
|
||||
crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def get_virtual_size(self):
|
||||
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
|
||||
if sys.platform != 'darwin':
|
||||
# on Mac OS X ru_maxrss is in bytes, on Linux it is in KB
|
||||
size *= 1024
|
||||
return size
|
||||
|
||||
def engine_started(self):
|
||||
self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
|
||||
self.tasks = []
|
||||
tsk = task.LoopingCall(self.update)
|
||||
self.tasks.append(tsk)
|
||||
tsk.start(60.0, now=True)
|
||||
if self.limit:
|
||||
tsk = task.LoopingCall(self._check_limit)
|
||||
self.tasks.append(tsk)
|
||||
tsk.start(60.0, now=True)
|
||||
if self.warning:
|
||||
tsk = task.LoopingCall(self._check_warning)
|
||||
self.tasks.append(tsk)
|
||||
tsk.start(60.0, now=True)
|
||||
|
||||
def engine_stopped(self):
|
||||
for tsk in self.tasks:
|
||||
if tsk.running:
|
||||
tsk.stop()
|
||||
|
||||
def update(self):
|
||||
self.crawler.stats.max_value('memusage/max', self.get_virtual_size())
|
||||
|
||||
def _check_limit(self):
|
||||
if self.get_virtual_size() > self.limit:
|
||||
self.crawler.stats.set_value('memusage/limit_reached', 1)
|
||||
mem = self.limit/1024/1024
|
||||
logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = "%s terminated: memory usage exceeded %dM at %s" % \
|
||||
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
self._send_report(self.notify_mails, subj)
|
||||
self.crawler.stats.set_value('memusage/limit_notified', 1)
|
||||
|
||||
open_spiders = self.crawler.engine.open_spiders
|
||||
if open_spiders:
|
||||
for spider in open_spiders:
|
||||
self.crawler.engine.close_spider(spider, 'memusage_exceeded')
|
||||
else:
|
||||
self.crawler.stop()
|
||||
|
||||
def _check_warning(self):
|
||||
if self.warned: # warn only once
|
||||
return
|
||||
if self.get_virtual_size() > self.warning:
|
||||
self.crawler.stats.set_value('memusage/warning_reached', 1)
|
||||
mem = self.warning/1024/1024
|
||||
logger.warning("Memory usage reached %(memusage)dM",
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = "%s warning: memory usage reached %dM at %s" % \
|
||||
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
self._send_report(self.notify_mails, subj)
|
||||
self.crawler.stats.set_value('memusage/warning_notified', 1)
|
||||
self.warned = True
|
||||
|
||||
def _send_report(self, rcpts, subject):
|
||||
"""send notification mail with some additional useful info"""
|
||||
stats = self.crawler.stats
|
||||
s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
|
||||
s += "Maximum memory usage : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
|
||||
s += "Current memory usage : %dM\r\n" % (self.get_virtual_size()/1024/1024)
|
||||
|
||||
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
|
||||
s += "\r\n"
|
||||
s += pformat(get_engine_status(self.crawler.engine))
|
||||
s += "\r\n"
|
||||
self.mail.send(rcpts, subject, s)
|
||||
from scrapy.extensions.memusage import *
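get_virtual_size above relies on a platform quirk of the Unix-only resource module: ru_maxrss is reported in kilobytes on Linux but in bytes on OS X. A standalone sketch of that normalisation:

import sys
import resource   # Unix-only, hence the NotConfigured fallback above

def max_rss_bytes():
    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != 'darwin':
        size *= 1024   # Linux (and other Unixes) report kilobytes
    return size

print(max_rss_bytes() // (1024 * 1024))   # current peak RSS in MB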
|
||||
|
@@ -1,32 +1,7 @@
"""
Item pipeline

See documentation in docs/item-pipeline.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline` is deprecated, "
"use `scrapy.pipelines` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.conf import build_component_list
|
||||
|
||||
class ItemPipelineManager(MiddlewareManager):
|
||||
|
||||
component_name = 'item pipeline'
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
item_pipelines = settings['ITEM_PIPELINES']
|
||||
if isinstance(item_pipelines, (tuple, list, set, frozenset)):
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
import warnings
|
||||
warnings.warn('ITEM_PIPELINES defined as a list or a set is deprecated, switch to a dict',
|
||||
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||
# convert old ITEM_PIPELINE list to a dict with order 500
|
||||
item_pipelines = dict(zip(item_pipelines, range(500, 500+len(item_pipelines))))
|
||||
return build_component_list(settings['ITEM_PIPELINES_BASE'], item_pipelines)
|
||||
|
||||
def _add_middleware(self, pipe):
|
||||
super(ItemPipelineManager, self)._add_middleware(pipe)
|
||||
if hasattr(pipe, 'process_item'):
|
||||
self.methods['process_item'].append(pipe.process_item)
|
||||
|
||||
def process_item(self, item, spider):
|
||||
return self._process_chain('process_item', item, spider)
|
||||
from scrapy.pipelines import *
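_get_mwlist_from_settings above still accepts the old list form of ITEM_PIPELINES, but warns and converts it to a dict with orders starting at 500. A configuration sketch; the pipeline paths are hypothetical project classes:

# Deprecated list form (triggers the ScrapyDeprecationWarning shown above):
#     ITEM_PIPELINES = ['myproject.pipelines.ValidatePipeline',
#                       'myproject.pipelines.StorePipeline']

# Preferred dict form, with explicit order values (lower runs first):
ITEM_PIPELINES = {
    'myproject.pipelines.ValidatePipeline': 300,
    'myproject.pipelines.StorePipeline': 800,
}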
|
||||
|
@@ -1,334 +1,7 @@
"""
Files Pipeline

See documentation in topics/media-pipeline.rst
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
warnings.warn("Module `scrapy.contrib.pipeline.files` is deprecated, "
"use `scrapy.pipelines.files` instead",
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import os.path
|
||||
import rfc822
|
||||
import time
|
||||
import logging
|
||||
from six.moves.urllib.parse import urlparse
|
||||
from collections import defaultdict
|
||||
import six
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO as BytesIO
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
from twisted.internet import defer, threads
|
||||
|
||||
from scrapy.contrib.pipeline.media import MediaPipeline
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.misc import md5sum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileException(Exception):
|
||||
"""General media error exception"""
|
||||
|
||||
|
||||
class FSFilesStore(object):
|
||||
|
||||
def __init__(self, basedir):
|
||||
if '://' in basedir:
|
||||
basedir = basedir.split('://', 1)[1]
|
||||
self.basedir = basedir
|
||||
self._mkdir(self.basedir)
|
||||
self.created_directories = defaultdict(set)
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
self._mkdir(os.path.dirname(absolute_path), info)
|
||||
with open(absolute_path, 'wb') as f:
|
||||
f.write(buf.getvalue())
|
||||
|
||||
def stat_file(self, path, info):
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
try:
|
||||
last_modified = os.path.getmtime(absolute_path)
|
||||
except: # FIXME: catching everything!
|
||||
return {}
|
||||
|
||||
with open(absolute_path, 'rb') as f:
|
||||
checksum = md5sum(f)
|
||||
|
||||
return {'last_modified': last_modified, 'checksum': checksum}
|
||||
|
||||
def _get_filesystem_path(self, path):
|
||||
path_comps = path.split('/')
|
||||
return os.path.join(self.basedir, *path_comps)
|
||||
|
||||
def _mkdir(self, dirname, domain=None):
|
||||
seen = self.created_directories[domain] if domain else set()
|
||||
if dirname not in seen:
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
seen.add(dirname)
|
||||
|
||||
|
||||
class S3FilesStore(object):
|
||||
|
||||
AWS_ACCESS_KEY_ID = None
|
||||
AWS_SECRET_ACCESS_KEY = None
|
||||
|
||||
POLICY = 'public-read'
|
||||
HEADERS = {
|
||||
'Cache-Control': 'max-age=172800',
|
||||
}
|
||||
|
||||
def __init__(self, uri):
|
||||
assert uri.startswith('s3://')
|
||||
self.bucket, self.prefix = uri[5:].split('/', 1)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
def _onsuccess(boto_key):
|
||||
checksum = boto_key.etag.strip('"')
|
||||
last_modified = boto_key.last_modified
|
||||
modified_tuple = rfc822.parsedate_tz(last_modified)
|
||||
modified_stamp = int(rfc822.mktime_tz(modified_tuple))
|
||||
return {'checksum': checksum, 'last_modified': modified_stamp}
|
||||
|
||||
return self._get_boto_key(path).addCallback(_onsuccess)
|
||||
|
||||
def _get_boto_bucket(self):
|
||||
from boto.s3.connection import S3Connection
|
||||
# disable ssl (is_secure=False) because of this python bug:
|
||||
# http://bugs.python.org/issue5103
|
||||
c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, is_secure=False)
|
||||
return c.get_bucket(self.bucket, validate=False)
|
||||
|
||||
def _get_boto_key(self, path):
|
||||
b = self._get_boto_bucket()
|
||||
key_name = '%s%s' % (self.prefix, path)
|
||||
return threads.deferToThread(b.get_key, key_name)
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
"""Upload file to S3 storage"""
|
||||
b = self._get_boto_bucket()
|
||||
key_name = '%s%s' % (self.prefix, path)
|
||||
k = b.new_key(key_name)
|
||||
if meta:
|
||||
for metakey, metavalue in six.iteritems(meta):
|
||||
k.set_metadata(metakey, str(metavalue))
|
||||
h = self.HEADERS.copy()
|
||||
if headers:
|
||||
h.update(headers)
|
||||
buf.seek(0)
|
||||
return threads.deferToThread(k.set_contents_from_string, buf.getvalue(),
|
||||
headers=h, policy=self.POLICY)
|
||||
|
||||
|
||||
class FilesPipeline(MediaPipeline):
|
||||
"""Abstract pipeline that implement the file downloading
|
||||
|
||||
This pipeline tries to minimize network transfers and file processing,
|
||||
doing stat of the files and determining if file is new, uptodate or
|
||||
expired.
|
||||
|
||||
`new` files are those that pipeline never processed and needs to be
|
||||
downloaded from supplier site the first time.
|
||||
|
||||
`uptodate` files are the ones that the pipeline processed and are still
|
||||
valid files.
|
||||
|
||||
`expired` files are those that pipeline already processed but the last
|
||||
modification was made long time ago, so a reprocessing is recommended to
|
||||
refresh it in case of change.
|
||||
|
||||
"""
|
||||
|
||||
MEDIA_NAME = "file"
|
||||
EXPIRES = 90
|
||||
STORE_SCHEMES = {
|
||||
'': FSFilesStore,
|
||||
'file': FSFilesStore,
|
||||
's3': S3FilesStore,
|
||||
}
|
||||
DEFAULT_FILES_URLS_FIELD = 'file_urls'
|
||||
DEFAULT_FILES_RESULT_FIELD = 'files'
|
||||
|
||||
def __init__(self, store_uri, download_func=None):
|
||||
if not store_uri:
|
||||
raise NotConfigured
|
||||
self.store = self._get_store(store_uri)
|
||||
super(FilesPipeline, self).__init__(download_func=download_func)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
|
||||
cls.FILES_URLS_FIELD = settings.get('FILES_URLS_FIELD', cls.DEFAULT_FILES_URLS_FIELD)
|
||||
cls.FILES_RESULT_FIELD = settings.get('FILES_RESULT_FIELD', cls.DEFAULT_FILES_RESULT_FIELD)
|
||||
cls.EXPIRES = settings.getint('FILES_EXPIRES', 90)
|
||||
store_uri = settings['FILES_STORE']
|
||||
return cls(store_uri)
|
||||
|
||||
def _get_store(self, uri):
|
||||
if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir
|
||||
scheme = 'file'
|
||||
else:
|
||||
scheme = urlparse(uri).scheme
|
||||
store_cls = self.STORE_SCHEMES[scheme]
|
||||
return store_cls(uri)
|
||||
|
||||
def media_to_download(self, request, info):
|
||||
def _onsuccess(result):
|
||||
if not result:
|
||||
return # returning None forces download
|
||||
|
||||
last_modified = result.get('last_modified', None)
|
||||
if not last_modified:
|
||||
return # returning None forces download
|
||||
|
||||
age_seconds = time.time() - last_modified
|
||||
age_days = age_seconds / 60 / 60 / 24
|
||||
if age_days > self.EXPIRES:
|
||||
return # returning None forces download
|
||||
|
||||
referer = request.headers.get('Referer')
|
||||
logger.debug(
|
||||
'File (uptodate): Downloaded %(medianame)s from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, 'uptodate')
|
||||
|
||||
checksum = result.get('checksum', None)
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum}
|
||||
|
||||
path = self.file_path(request, info=info)
|
||||
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
|
||||
dfd.addCallbacks(_onsuccess, lambda _: None)
|
||||
dfd.addErrback(
|
||||
lambda f:
|
||||
logger.error(self.__class__.__name__ + '.store.stat_file',
|
||||
extra={'spider': info.spider, 'failure': f})
|
||||
)
|
||||
return dfd
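The ``_onsuccess`` callback above only inspects the ``last_modified`` and ``checksum`` keys of whatever ``stat_file()`` returns, so a custom store merely has to supply them. A rough, hypothetical sketch (not a complete store; ``persist_file()`` is omitted)::

    import os

    class DummyFilesStore(object):
        """Illustrative store: stat_file() only needs to return the keys
        that media_to_download() reads above."""

        def __init__(self, basedir):
            self.basedir = basedir

        def stat_file(self, path, info):
            absolute = os.path.join(self.basedir, path)
            try:
                mtime = os.path.getmtime(absolute)
            except OSError:
                return {}  # unknown file: falsy result forces a fresh download
            return {'last_modified': mtime, 'checksum': None}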
|
||||
|
||||
def media_failed(self, failure, request, info):
|
||||
if not isinstance(failure.value, IgnoreRequest):
|
||||
referer = request.headers.get('Referer')
|
||||
logger.warning(
|
||||
'File (unknown-error): Error downloading %(medianame)s from '
|
||||
'%(request)s referred in <%(referer)s>: %(exception)s',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer, 'exception': failure.value},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
|
||||
raise FileException
|
||||
|
||||
def media_downloaded(self, response, request, info):
|
||||
referer = request.headers.get('Referer')
|
||||
|
||||
if response.status != 200:
|
||||
logger.warning(
|
||||
'File (code: %(status)s): Error downloading file from '
|
||||
'%(request)s referred in <%(referer)s>',
|
||||
{'status': response.status,
|
||||
'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('download-error')
|
||||
|
||||
if not response.body:
|
||||
logger.warning(
|
||||
'File (empty-content): Empty file from %(request)s referred '
|
||||
'in <%(referer)s>: no-content',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('empty-content')
|
||||
|
||||
status = 'cached' if 'cached' in response.flags else 'downloaded'
|
||||
logger.debug(
|
||||
'File (%(status)s): Downloaded file from %(request)s referred in '
|
||||
'<%(referer)s>',
|
||||
{'status': status, 'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, status)
|
||||
|
||||
try:
|
||||
path = self.file_path(request, response=response, info=info)
|
||||
checksum = self.file_downloaded(response, request, info)
|
||||
except FileException as exc:
|
||||
logger.warning(
|
||||
'File (error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>: %(errormsg)s',
|
||||
{'request': request, 'referer': referer, 'errormsg': str(exc)},
|
||||
extra={'spider': info.spider}, exc_info=True
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.exception(
|
||||
'File (unknown-error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException(str(exc))
|
||||
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum}
|
||||
|
||||
def inc_stats(self, spider, status):
|
||||
spider.crawler.stats.inc_value('file_count', spider=spider)
|
||||
spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)
|
||||
|
||||
### Overridable Interface
|
||||
def get_media_requests(self, item, info):
|
||||
return [Request(x) for x in item.get(self.FILES_URLS_FIELD, [])]
|
||||
|
||||
def file_downloaded(self, response, request, info):
|
||||
path = self.file_path(request, response=response, info=info)
|
||||
buf = BytesIO(response.body)
|
||||
self.store.persist_file(path, buf, info)
|
||||
checksum = md5sum(buf)
|
||||
return checksum
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
if isinstance(item, dict) or self.FILES_RESULT_FIELD in item.fields:
|
||||
item[self.FILES_RESULT_FIELD] = [x for ok, x in results if ok]
|
||||
return item
|
||||
|
||||
def file_path(self, request, response=None, info=None):
|
||||
## start of deprecation warning block (can be removed in the future)
|
||||
def _warn():
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
import warnings
|
||||
warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
|
||||
'file_path(request, response=None, info=None) instead',
|
||||
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||
|
||||
# check if called from file_key with url as first argument
|
||||
if not isinstance(request, Request):
|
||||
_warn()
|
||||
url = request
|
||||
else:
|
||||
url = request.url
|
||||
|
||||
# detect if file_key() method has been overridden
|
||||
if not hasattr(self.file_key, '_base'):
|
||||
_warn()
|
||||
return self.file_key(url)
|
||||
## end of deprecation warning block
|
||||
|
||||
media_guid = hashlib.sha1(url).hexdigest() # change to request.url after deprecation
|
||||
media_ext = os.path.splitext(url)[1] # change to request.url after deprecation
|
||||
return 'full/%s%s' % (media_guid, media_ext)
|
||||
|
||||
# deprecated
|
||||
def file_key(self, url):
|
||||
return self.file_path(url)
|
||||
file_key._base = True
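Projects that need a different storage layout usually subclass the pipeline and override ``file_path()``. A small sketch under that assumption (class name and layout are invented)::

    import hashlib
    import os
    from six.moves.urllib.parse import urlparse

    from scrapy.pipelines.files import FilesPipeline

    class PerDomainFilesPipeline(FilesPipeline):
        def file_path(self, request, response=None, info=None):
            # group stored files by originating host instead of a flat 'full/' dir
            host = urlparse(request.url).netloc
            media_guid = hashlib.sha1(request.url).hexdigest()
            media_ext = os.path.splitext(request.url)[1]
            return '%s/%s%s' % (host, media_guid, media_ext)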
|
||||
from scrapy.pipelines.files import *
|
||||
|
@ -1,182 +1,7 @@
|
||||
"""
|
||||
Images Pipeline
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.pipeline.images` is deprecated, "
|
||||
"use `scrapy.pipelines.images` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in topics/media-pipeline.rst
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import six
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO as BytesIO
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from scrapy.utils.misc import md5sum
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import DropItem
|
||||
#TODO: from scrapy.contrib.pipeline.media import MediaPipeline
|
||||
from scrapy.contrib.pipeline.files import FileException, FilesPipeline
|
||||
|
||||
|
||||
class NoimagesDrop(DropItem):
|
||||
"""Product with no images exception"""
|
||||
|
||||
|
||||
class ImageException(FileException):
|
||||
"""General image error exception"""
|
||||
|
||||
|
||||
class ImagesPipeline(FilesPipeline):
|
||||
"""Abstract pipeline that implement the image thumbnail generation logic
|
||||
|
||||
"""
|
||||
|
||||
MEDIA_NAME = 'image'
|
||||
MIN_WIDTH = 0
|
||||
MIN_HEIGHT = 0
|
||||
THUMBS = {}
|
||||
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
|
||||
DEFAULT_IMAGES_RESULT_FIELD = 'images'
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
|
||||
cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
|
||||
cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
|
||||
cls.THUMBS = settings.get('IMAGES_THUMBS', {})
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
|
||||
cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
|
||||
cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
|
||||
store_uri = settings['IMAGES_STORE']
|
||||
return cls(store_uri)
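A settings sketch exercising the values read in ``from_settings()`` above (paths and sizes are examples only)::

    # settings.py
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = '/data/scrapy/images'   # example path
    IMAGES_EXPIRES = 30
    IMAGES_MIN_WIDTH = 110                 # drop images smaller than 110x110
    IMAGES_MIN_HEIGHT = 110
    IMAGES_THUMBS = {
        'small': (50, 50),                 # stored as thumbs/small/<sha1>.jpg
        'big': (270, 270),
    }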
|
||||
|
||||
def file_downloaded(self, response, request, info):
|
||||
return self.image_downloaded(response, request, info)
|
||||
|
||||
def image_downloaded(self, response, request, info):
|
||||
checksum = None
|
||||
for path, image, buf in self.get_images(response, request, info):
|
||||
if checksum is None:
|
||||
buf.seek(0)
|
||||
checksum = md5sum(buf)
|
||||
width, height = image.size
|
||||
self.store.persist_file(
|
||||
path, buf, info,
|
||||
meta={'width': width, 'height': height},
|
||||
headers={'Content-Type': 'image/jpeg'})
|
||||
return checksum
|
||||
|
||||
def get_images(self, response, request, info):
|
||||
path = self.file_path(request, response=response, info=info)
|
||||
orig_image = Image.open(BytesIO(response.body))
|
||||
|
||||
width, height = orig_image.size
|
||||
if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
|
||||
raise ImageException("Image too small (%dx%d < %dx%d)" %
|
||||
(width, height, self.MIN_WIDTH, self.MIN_HEIGHT))
|
||||
|
||||
image, buf = self.convert_image(orig_image)
|
||||
yield path, image, buf
|
||||
|
||||
for thumb_id, size in six.iteritems(self.THUMBS):
|
||||
thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
|
||||
thumb_image, thumb_buf = self.convert_image(image, size)
|
||||
yield thumb_path, thumb_image, thumb_buf
|
||||
|
||||
def convert_image(self, image, size=None):
|
||||
if image.format == 'PNG' and image.mode == 'RGBA':
|
||||
background = Image.new('RGBA', image.size, (255, 255, 255))
|
||||
background.paste(image, image)
|
||||
image = background.convert('RGB')
|
||||
elif image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
if size:
|
||||
image = image.copy()
|
||||
image.thumbnail(size, Image.ANTIALIAS)
|
||||
|
||||
buf = BytesIO()
|
||||
image.save(buf, 'JPEG')
|
||||
return image, buf
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
if isinstance(item, dict) or self.IMAGES_RESULT_FIELD in item.fields:
|
||||
item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
|
||||
return item
|
||||
|
||||
def file_path(self, request, response=None, info=None):
|
||||
## start of deprecation warning block (can be removed in the future)
|
||||
def _warn():
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
import warnings
|
||||
warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
|
||||
'please use file_path(request, response=None, info=None) instead',
|
||||
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||
|
||||
# check if called from image_key or file_key with url as first argument
|
||||
if not isinstance(request, Request):
|
||||
_warn()
|
||||
url = request
|
||||
else:
|
||||
url = request.url
|
||||
|
||||
# detect if file_key() or image_key() methods have been overridden
|
||||
if not hasattr(self.file_key, '_base'):
|
||||
_warn()
|
||||
return self.file_key(url)
|
||||
elif not hasattr(self.image_key, '_base'):
|
||||
_warn()
|
||||
return self.image_key(url)
|
||||
## end of deprecation warning block
|
||||
|
||||
image_guid = hashlib.sha1(url).hexdigest() # change to request.url after deprecation
|
||||
return 'full/%s.jpg' % (image_guid)
|
||||
|
||||
def thumb_path(self, request, thumb_id, response=None, info=None):
|
||||
## start of deprecation warning block (can be removed in the future)
|
||||
def _warn():
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
import warnings
|
||||
warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
|
||||
'thumb_path(request, thumb_id, response=None, info=None) instead',
|
||||
category=ScrapyDeprecationWarning, stacklevel=1)
|
||||
|
||||
# check if called from thumb_key with url as first argument
|
||||
if not isinstance(request, Request):
|
||||
_warn()
|
||||
url = request
|
||||
else:
|
||||
url = request.url
|
||||
|
||||
# detect if thumb_key() method has been overridden
|
||||
if not hasattr(self.thumb_key, '_base'):
|
||||
_warn()
|
||||
return self.thumb_key(url, thumb_id)
|
||||
## end of deprecation warning block
|
||||
|
||||
thumb_guid = hashlib.sha1(url).hexdigest() # change to request.url after deprecation
|
||||
return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
|
||||
|
||||
# deprecated
|
||||
def file_key(self, url):
|
||||
return self.image_key(url)
|
||||
file_key._base = True
|
||||
|
||||
# deprecated
|
||||
def image_key(self, url):
|
||||
return self.file_path(url)
|
||||
image_key._base = True
|
||||
|
||||
# deprecated
|
||||
def thumb_key(self, url, thumb_id):
|
||||
return self.thumb_path(url, thumb_id)
|
||||
thumb_key._base = True
|
||||
from scrapy.pipelines.images import *
|
||||
|
@ -1,132 +1,7 @@
|
||||
from __future__ import print_function
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.pipeline.media` is deprecated, "
|
||||
"use `scrapy.pipelines.media` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from twisted.internet.defer import Deferred, DeferredList
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.utils.defer import mustbe_deferred, defer_result
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MediaPipeline(object):
|
||||
|
||||
LOG_FAILED_RESULTS = True
|
||||
|
||||
class SpiderInfo(object):
|
||||
def __init__(self, spider):
|
||||
self.spider = spider
|
||||
self.downloading = set()
|
||||
self.downloaded = {}
|
||||
self.waiting = defaultdict(list)
|
||||
|
||||
def __init__(self, download_func=None):
|
||||
self.download_func = download_func
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
try:
|
||||
pipe = cls.from_settings(crawler.settings)
|
||||
except AttributeError:
|
||||
pipe = cls()
|
||||
pipe.crawler = crawler
|
||||
return pipe
|
||||
|
||||
def open_spider(self, spider):
|
||||
self.spiderinfo = self.SpiderInfo(spider)
|
||||
|
||||
def process_item(self, item, spider):
|
||||
info = self.spiderinfo
|
||||
requests = arg_to_iter(self.get_media_requests(item, info))
|
||||
dlist = [self._process_request(r, info) for r in requests]
|
||||
dfd = DeferredList(dlist, consumeErrors=1)
|
||||
return dfd.addCallback(self.item_completed, item, info)
|
||||
|
||||
def _process_request(self, request, info):
|
||||
fp = request_fingerprint(request)
|
||||
cb = request.callback or (lambda _: _)
|
||||
eb = request.errback
|
||||
request.callback = None
|
||||
request.errback = None
|
||||
|
||||
# Return cached result if request was already seen
|
||||
if fp in info.downloaded:
|
||||
return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)
|
||||
|
||||
# Otherwise, wait for result
|
||||
wad = Deferred().addCallbacks(cb, eb)
|
||||
info.waiting[fp].append(wad)
|
||||
|
||||
# Check if request is downloading right now to avoid doing it twice
|
||||
if fp in info.downloading:
|
||||
return wad
|
||||
|
||||
# Download request checking media_to_download hook output first
|
||||
info.downloading.add(fp)
|
||||
dfd = mustbe_deferred(self.media_to_download, request, info)
|
||||
dfd.addCallback(self._check_media_to_download, request, info)
|
||||
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
|
||||
dfd.addErrback(lambda f: logger.error(
|
||||
f.value, extra={'spider': info.spider, 'failure': f})
|
||||
)
|
||||
return dfd.addBoth(lambda _: wad) # it must return wad at last
|
||||
|
||||
def _check_media_to_download(self, result, request, info):
|
||||
if result is not None:
|
||||
return result
|
||||
if self.download_func:
|
||||
# this ugly code was left only to support tests. TODO: remove
|
||||
dfd = mustbe_deferred(self.download_func, request, info.spider)
|
||||
dfd.addCallbacks(
|
||||
callback=self.media_downloaded, callbackArgs=(request, info),
|
||||
errback=self.media_failed, errbackArgs=(request, info))
|
||||
else:
|
||||
request.meta['handle_httpstatus_all'] = True
|
||||
dfd = self.crawler.engine.download(request, info.spider)
|
||||
dfd.addCallbacks(
|
||||
callback=self.media_downloaded, callbackArgs=(request, info),
|
||||
errback=self.media_failed, errbackArgs=(request, info))
|
||||
return dfd
|
||||
|
||||
def _cache_result_and_execute_waiters(self, result, fp, info):
|
||||
if isinstance(result, Failure):
|
||||
# minimize cached information for failure
|
||||
result.cleanFailure()
|
||||
result.frames = []
|
||||
result.stack = None
|
||||
info.downloading.remove(fp)
|
||||
info.downloaded[fp] = result # cache result
|
||||
for wad in info.waiting.pop(fp):
|
||||
defer_result(result).chainDeferred(wad)
|
||||
|
||||
### Overridable Interface
|
||||
def media_to_download(self, request, info):
|
||||
"""Check request before starting download"""
|
||||
pass
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
"""Returns the media requests to download"""
|
||||
pass
|
||||
|
||||
def media_downloaded(self, response, request, info):
|
||||
"""Handler for success downloads"""
|
||||
return response
|
||||
|
||||
def media_failed(self, failure, request, info):
|
||||
"""Handler for failed downloads"""
|
||||
return failure
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
"""Called per item when all media requests has been processed"""
|
||||
if self.LOG_FAILED_RESULTS:
|
||||
for ok, value in results:
|
||||
if not ok:
|
||||
logger.error(
|
||||
'%(class)s found errors processing %(item)s',
|
||||
{'class': self.__class__.__name__, 'item': item},
|
||||
extra={'spider': info.spider, 'failure': value}
|
||||
)
|
||||
return item
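The overridable interface above is all that a concrete pipeline has to fill in. A bare-bones, hypothetical subclass (field names are invented; the import assumes the relocated module path)::

    from scrapy.http import Request
    from scrapy.pipelines.media import MediaPipeline

    class ArchiveSizePipeline(MediaPipeline):
        """Hypothetical pipeline: downloads each item's 'archive_urls'
        and stores the response sizes back on the item."""

        def get_media_requests(self, item, info):
            return [Request(url) for url in item.get('archive_urls', [])]

        def media_downloaded(self, response, request, info):
            # the returned value is cached and passed to item_completed()
            return len(response.body)

        def item_completed(self, results, item, info):
            item['archive_sizes'] = [value for ok, value in results if ok]
            return item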
|
||||
from scrapy.pipelines.media import *
|
||||
|
@ -1,54 +1,7 @@
|
||||
"""
|
||||
Depth Spider Middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spidermiddleware.depth` is deprecated, "
|
||||
"use `scrapy.spidermiddlewares.depth` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from scrapy.http import Request
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DepthMiddleware(object):
|
||||
|
||||
def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
|
||||
self.maxdepth = maxdepth
|
||||
self.stats = stats
|
||||
self.verbose_stats = verbose_stats
|
||||
self.prio = prio
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
settings = crawler.settings
|
||||
maxdepth = settings.getint('DEPTH_LIMIT')
|
||||
verbose = settings.getbool('DEPTH_STATS_VERBOSE')
|
||||
prio = settings.getint('DEPTH_PRIORITY')
|
||||
return cls(maxdepth, crawler.stats, verbose, prio)
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
def _filter(request):
|
||||
if isinstance(request, Request):
|
||||
depth = response.meta['depth'] + 1
|
||||
request.meta['depth'] = depth
|
||||
if self.prio:
|
||||
request.priority -= depth * self.prio
|
||||
if self.maxdepth and depth > self.maxdepth:
|
||||
logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
|
||||
{'maxdepth': self.maxdepth, 'requrl': request.url},
|
||||
extra={'spider': spider})
|
||||
return False
|
||||
elif self.stats:
|
||||
if self.verbose_stats:
|
||||
self.stats.inc_value('request_depth_count/%s' % depth, spider=spider)
|
||||
self.stats.max_value('request_depth_max', depth, spider=spider)
|
||||
return True
|
||||
|
||||
# base case (depth=0)
|
||||
if self.stats and 'depth' not in response.meta:
|
||||
response.meta['depth'] = 0
|
||||
if self.verbose_stats:
|
||||
self.stats.inc_value('request_depth_count/0', spider=spider)
|
||||
|
||||
return (r for r in result or () if _filter(r))
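A settings sketch for the values read in ``from_crawler()`` above (the numbers are examples)::

    # settings.py
    DEPTH_LIMIT = 5             # drop requests deeper than 5 links from the start URLs
    DEPTH_STATS_VERBOSE = True  # record request_depth_count/<n> stats
    DEPTH_PRIORITY = 1          # positive value lowers priority as depth grows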
|
||||
from scrapy.spidermiddlewares.depth import *
|
||||
|
@ -1,53 +1,7 @@
|
||||
"""
|
||||
HttpError Spider Middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spidermiddleware.httperror` is deprecated, "
|
||||
"use `scrapy.spidermiddlewares.httperror` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
import logging
|
||||
|
||||
from scrapy.exceptions import IgnoreRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HttpError(IgnoreRequest):
|
||||
"""A non-200 response was filtered"""
|
||||
|
||||
def __init__(self, response, *args, **kwargs):
|
||||
self.response = response
|
||||
super(HttpError, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class HttpErrorMiddleware(object):
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def __init__(self, settings):
|
||||
self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
|
||||
self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
if 200 <= response.status < 300: # common case
|
||||
return
|
||||
meta = response.meta
|
||||
if 'handle_httpstatus_all' in meta:
|
||||
return
|
||||
if 'handle_httpstatus_list' in meta:
|
||||
allowed_statuses = meta['handle_httpstatus_list']
|
||||
elif self.handle_httpstatus_all:
|
||||
return
|
||||
else:
|
||||
allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
|
||||
if response.status in allowed_statuses:
|
||||
return
|
||||
raise HttpError(response, 'Ignoring non-200 response')
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
if isinstance(exception, HttpError):
|
||||
logger.debug(
|
||||
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
|
||||
{'response': response}, extra={'spider': spider},
|
||||
)
|
||||
return []
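Both lookups in ``process_spider_input()`` above can be exercised from a spider; a hypothetical sketch (spider name and URLs are invented)::

    from scrapy.http import Request
    from scrapy.spider import Spider

    class NotFoundSpider(Spider):
        name = 'notfound-example'
        handle_httpstatus_list = [404]   # spider-wide allowance

        def start_requests(self):
            # a per-request allowance via meta also short-circuits the middleware
            yield Request('http://example.com/missing',
                          meta={'handle_httpstatus_list': [404]})

        def parse(self, response):
            return []

    # settings-level alternatives read above:
    # HTTPERROR_ALLOWED_CODES = [404]  or  HTTPERROR_ALLOW_ALL = True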
|
||||
from scrapy.spidermiddlewares.httperror import *
|
||||
|
@ -1,59 +1,7 @@
|
||||
"""
|
||||
Offsite Spider Middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spidermiddleware.offsite` is deprecated, "
|
||||
"use `scrapy.spidermiddlewares.offsite` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class OffsiteMiddleware(object):
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
for x in result:
|
||||
if isinstance(x, Request):
|
||||
if x.dont_filter or self.should_follow(x, spider):
|
||||
yield x
|
||||
else:
|
||||
domain = urlparse_cached(x).hostname
|
||||
if domain and domain not in self.domains_seen:
|
||||
self.domains_seen.add(domain)
|
||||
logger.debug("Filtered offsite request to %(domain)r: %(request)s",
|
||||
{'domain': domain, 'request': x}, extra={'spider': spider})
|
||||
self.stats.inc_value('offsite/domains', spider=spider)
|
||||
self.stats.inc_value('offsite/filtered', spider=spider)
|
||||
else:
|
||||
yield x
|
||||
|
||||
def should_follow(self, request, spider):
|
||||
regex = self.host_regex
|
||||
# hostname can be None for wrong urls (like javascript links)
|
||||
host = urlparse_cached(request).hostname or ''
|
||||
return bool(regex.search(host))
|
||||
|
||||
def get_host_regex(self, spider):
|
||||
"""Override this method to implement a different offsite policy"""
|
||||
allowed_domains = getattr(spider, 'allowed_domains', None)
|
||||
if not allowed_domains:
|
||||
return re.compile('') # allow all by default
|
||||
regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None)
|
||||
return re.compile(regex)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.host_regex = self.get_host_regex(spider)
|
||||
self.domains_seen = set()
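For example, with the hypothetical spider below the middleware builds the host regex ``^(.*\.)?(example\.com)$``, so links to ``www.example.com`` are followed while other hosts are filtered (unless ``dont_filter`` is set on the request)::

    from scrapy.spider import Spider

    class ShopSpider(Spider):
        name = 'shop'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            return []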
|
||||
from scrapy.spidermiddlewares.offsite import *
|
||||
|
@ -1,23 +1,7 @@
|
||||
"""
|
||||
RefererMiddleware: populates Request referer field, based on the Response which
|
||||
originated it.
|
||||
"""
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
class RefererMiddleware(object):
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('REFERER_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls()
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
def _set_referer(r):
|
||||
if isinstance(r, Request):
|
||||
r.headers.setdefault('Referer', response.url)
|
||||
return r
|
||||
return (_set_referer(r) for r in result or ())
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spidermiddleware.referer` is deprecated, "
|
||||
"use `scrapy.spidermiddlewares.referer` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.spidermiddlewares.referer import *
|
||||
|
@ -1,37 +1,7 @@
|
||||
"""
|
||||
Url Length Spider Middleware
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spidermiddleware.urllength` is deprecated, "
|
||||
"use `scrapy.spidermiddlewares.urllength` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UrlLengthMiddleware(object):
|
||||
|
||||
def __init__(self, maxlength):
|
||||
self.maxlength = maxlength
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
maxlength = settings.getint('URLLENGTH_LIMIT')
|
||||
if not maxlength:
|
||||
raise NotConfigured
|
||||
return cls(maxlength)
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
def _filter(request):
|
||||
if isinstance(request, Request) and len(request.url) > self.maxlength:
|
||||
logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
|
||||
{'maxlength': self.maxlength, 'url': request.url},
|
||||
extra={'spider': spider})
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
return (r for r in result or () if _filter(r))
|
||||
from scrapy.spidermiddlewares.urllength import *
|
||||
|
@ -1,3 +1,7 @@
|
||||
from scrapy.contrib.spiders.crawl import CrawlSpider, Rule
|
||||
from scrapy.contrib.spiders.feed import XMLFeedSpider, CSVFeedSpider
|
||||
from scrapy.contrib.spiders.sitemap import SitemapSpider
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiders` is deprecated, "
|
||||
"use `scrapy.spiders` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.spiders import *
|
||||
|
@ -1,98 +1,7 @@
|
||||
"""
|
||||
This modules implements the CrawlSpider which is the recommended spider to use
|
||||
for scraping typical web sites that requires crawling pages.
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiders.crawl` is deprecated, "
|
||||
"use `scrapy.spiders.crawl` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
See documentation in docs/topics/spiders.rst
|
||||
"""
|
||||
|
||||
import copy
|
||||
|
||||
from scrapy.http import Request, HtmlResponse
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.spider import Spider
|
||||
|
||||
def identity(x):
|
||||
return x
|
||||
|
||||
class Rule(object):
|
||||
|
||||
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
|
||||
self.link_extractor = link_extractor
|
||||
self.callback = callback
|
||||
self.cb_kwargs = cb_kwargs or {}
|
||||
self.process_links = process_links
|
||||
self.process_request = process_request
|
||||
if follow is None:
|
||||
self.follow = False if callback else True
|
||||
else:
|
||||
self.follow = follow
|
||||
|
||||
class CrawlSpider(Spider):
|
||||
|
||||
rules = ()
|
||||
|
||||
def __init__(self, *a, **kw):
|
||||
super(CrawlSpider, self).__init__(*a, **kw)
|
||||
self._compile_rules()
|
||||
|
||||
def parse(self, response):
|
||||
return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
|
||||
|
||||
def parse_start_url(self, response):
|
||||
return []
|
||||
|
||||
def process_results(self, response, results):
|
||||
return results
|
||||
|
||||
def _requests_to_follow(self, response):
|
||||
if not isinstance(response, HtmlResponse):
|
||||
return
|
||||
seen = set()
|
||||
for n, rule in enumerate(self._rules):
|
||||
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
|
||||
if links and rule.process_links:
|
||||
links = rule.process_links(links)
|
||||
for link in links:
|
||||
seen.add(link)
|
||||
r = Request(url=link.url, callback=self._response_downloaded)
|
||||
r.meta.update(rule=n, link_text=link.text)
|
||||
yield rule.process_request(r)
|
||||
|
||||
def _response_downloaded(self, response):
|
||||
rule = self._rules[response.meta['rule']]
|
||||
return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
|
||||
|
||||
def _parse_response(self, response, callback, cb_kwargs, follow=True):
|
||||
if callback:
|
||||
cb_res = callback(response, **cb_kwargs) or ()
|
||||
cb_res = self.process_results(response, cb_res)
|
||||
for requests_or_item in iterate_spider_output(cb_res):
|
||||
yield requests_or_item
|
||||
|
||||
if follow and self._follow_links:
|
||||
for request_or_item in self._requests_to_follow(response):
|
||||
yield request_or_item
|
||||
|
||||
def _compile_rules(self):
|
||||
def get_method(method):
|
||||
if callable(method):
|
||||
return method
|
||||
elif isinstance(method, basestring):
|
||||
return getattr(self, method, None)
|
||||
|
||||
self._rules = [copy.copy(r) for r in self.rules]
|
||||
for rule in self._rules:
|
||||
rule.callback = get_method(rule.callback)
|
||||
rule.process_links = get_method(rule.process_links)
|
||||
rule.process_request = get_method(rule.process_request)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
|
||||
spider._follow_links = crawler.settings.getbool(
|
||||
'CRAWLSPIDER_FOLLOW_LINKS', True)
|
||||
return spider
|
||||
|
||||
def set_crawler(self, crawler):
|
||||
super(CrawlSpider, self).set_crawler(crawler)
|
||||
self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
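A small, hypothetical CrawlSpider using these rules; the ``LinkExtractor`` import path is assumed to follow the relocated modules and may differ in your version::

    from scrapy.linkextractors import LinkExtractor   # assumed path
    from scrapy.spiders import CrawlSpider, Rule

    class BookSpider(CrawlSpider):
        name = 'books'
        allowed_domains = ['example.com']
        start_urls = ['http://example.com/catalogue/']

        rules = (
            # category pages: no callback, so follow defaults to True
            Rule(LinkExtractor(allow=r'/category/')),
            # item pages: callback set, so follow defaults to False
            Rule(LinkExtractor(allow=r'/item/\d+'), callback='parse_item'),
        )

        def parse_item(self, response):
            return {'url': response.url}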
|
||||
from scrapy.spiders.crawl import *
|
||||
|
@ -1,136 +1,7 @@
|
||||
"""
|
||||
This module implements the XMLFeedSpider which is the recommended spider to use
|
||||
for scraping from an XML feed.
|
||||
|
||||
See documentation in docs/topics/spiders.rst
|
||||
"""
|
||||
from scrapy.spider import Spider
|
||||
from scrapy.utils.iterators import xmliter, csviter
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.exceptions import NotConfigured, NotSupported
|
||||
|
||||
|
||||
class XMLFeedSpider(Spider):
|
||||
"""
|
||||
This class is intended to be the base class for spiders that scrape
from XML feeds.

You can choose whether to parse the file using the 'iternodes' iterator, an
'xml' selector, or an 'html' selector. In most cases it's convenient to
use 'iternodes', since it's faster and cleaner.
|
||||
"""
|
||||
|
||||
iterator = 'iternodes'
|
||||
itertag = 'item'
|
||||
namespaces = ()
|
||||
|
||||
def process_results(self, response, results):
|
||||
"""This overridable method is called for each result (item or request)
|
||||
returned by the spider, and it's intended to perform any last-minute
processing required before returning the results to the framework core,
for example setting the item GUIDs. It receives a list of results and
the response which originated those results. It must return a list of
|
||||
results (Items or Requests).
|
||||
"""
|
||||
return results
|
||||
|
||||
def adapt_response(self, response):
|
||||
"""You can override this function in order to make any changes you want
|
||||
to the feed before parsing it. This function must return a
|
||||
response.
|
||||
"""
|
||||
return response
|
||||
|
||||
def parse_node(self, response, selector):
|
||||
"""This method must be overriden with your custom spider functionality"""
|
||||
if hasattr(self, 'parse_item'): # backward compatibility
|
||||
return self.parse_item(response, selector)
|
||||
raise NotImplementedError
|
||||
|
||||
def parse_nodes(self, response, nodes):
|
||||
"""This method is called for the nodes matching the provided tag name
|
||||
(itertag). Receives the response and a Selector for each node.
Overriding this method is mandatory. Otherwise, your spider won't work.
|
||||
This method must return either a BaseItem, a Request, or a list
|
||||
containing any of them.
|
||||
"""
|
||||
|
||||
for selector in nodes:
|
||||
ret = iterate_spider_output(self.parse_node(response, selector))
|
||||
for result_item in self.process_results(response, ret):
|
||||
yield result_item
|
||||
|
||||
def parse(self, response):
|
||||
if not hasattr(self, 'parse_node'):
|
||||
raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
|
||||
|
||||
response = self.adapt_response(response)
|
||||
if self.iterator == 'iternodes':
|
||||
nodes = self._iternodes(response)
|
||||
elif self.iterator == 'xml':
|
||||
selector = Selector(response, type='xml')
|
||||
self._register_namespaces(selector)
|
||||
nodes = selector.xpath('//%s' % self.itertag)
|
||||
elif self.iterator == 'html':
|
||||
selector = Selector(response, type='html')
|
||||
self._register_namespaces(selector)
|
||||
nodes = selector.xpath('//%s' % self.itertag)
|
||||
else:
|
||||
raise NotSupported('Unsupported node iterator')
|
||||
|
||||
return self.parse_nodes(response, nodes)
|
||||
|
||||
def _iternodes(self, response):
|
||||
for node in xmliter(response, self.itertag):
|
||||
self._register_namespaces(node)
|
||||
yield node
|
||||
|
||||
def _register_namespaces(self, selector):
|
||||
for (prefix, uri) in self.namespaces:
|
||||
selector.register_namespace(prefix, uri)
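A minimal, hypothetical XMLFeedSpider sketch (feed URL and fields are invented; the ``scrapy.spiders`` import assumes the relocated path)::

    from scrapy.spiders import XMLFeedSpider

    class NewsFeedSpider(XMLFeedSpider):
        name = 'newsfeed'
        start_urls = ['http://example.com/feed.xml']
        iterator = 'iternodes'   # the default, shown for clarity
        itertag = 'item'

        def parse_node(self, response, node):
            return {'title': node.xpath('title/text()').extract(),
                    'link': node.xpath('link/text()').extract()}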
|
||||
|
||||
|
||||
class CSVFeedSpider(Spider):
|
||||
"""Spider for parsing CSV feeds.
|
||||
It receives a CSV file in a response; iterates through each of its rows,
|
||||
and calls parse_row with a dict containing each field's data.
|
||||
|
||||
You can set some options regarding the CSV file, such as the delimiter, quotechar
|
||||
and the file's headers.
|
||||
"""
|
||||
|
||||
delimiter = None # When this is None, python's csv module's default delimiter is used
|
||||
quotechar = None # When this is None, python's csv module's default quotechar is used
|
||||
headers = None
|
||||
|
||||
def process_results(self, response, results):
|
||||
"""This method has the same purpose as the one in XMLFeedSpider"""
|
||||
return results
|
||||
|
||||
def adapt_response(self, response):
|
||||
"""This method has the same purpose as the one in XMLFeedSpider"""
|
||||
return response
|
||||
|
||||
def parse_row(self, response, row):
|
||||
"""This method must be overriden with your custom spider functionality"""
|
||||
raise NotImplementedError
|
||||
|
||||
def parse_rows(self, response):
|
||||
"""Receives a response and a dict (representing each row) with a key for
|
||||
each provided (or detected) header of the CSV file. This spider also
|
||||
gives the opportunity to override adapt_response and
|
||||
process_results methods for pre- and post-processing purposes.
|
||||
"""
|
||||
|
||||
for row in csviter(response, self.delimiter, self.headers, self.quotechar):
|
||||
ret = iterate_spider_output(self.parse_row(response, row))
|
||||
for result_item in self.process_results(response, ret):
|
||||
yield result_item
|
||||
|
||||
def parse(self, response):
|
||||
if not hasattr(self, 'parse_row'):
|
||||
raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
|
||||
response = self.adapt_response(response)
|
||||
return self.parse_rows(response)
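A minimal, hypothetical CSVFeedSpider sketch (URL, delimiter and headers are invented; the ``scrapy.spiders`` import assumes the relocated path)::

    from scrapy.spiders import CSVFeedSpider

    class PriceListSpider(CSVFeedSpider):
        name = 'pricelist'
        start_urls = ['http://example.com/prices.csv']
        delimiter = ';'
        headers = ['sku', 'name', 'price']   # used when the file has no header row

        def parse_row(self, response, row):
            # row is a dict keyed by the headers above
            return {'sku': row['sku'], 'price': row['price']}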
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiders.feed` is deprecated, "
|
||||
"use `scrapy.spiders.feed` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.spiders.feed import *
|
||||
|
@ -1,31 +1,7 @@
|
||||
from scrapy.spider import Spider
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
class InitSpider(Spider):
|
||||
"""Base Spider with initialization facilities"""
|
||||
|
||||
def start_requests(self):
|
||||
self._postinit_reqs = super(InitSpider, self).start_requests()
|
||||
return iterate_spider_output(self.init_request())
|
||||
|
||||
def initialized(self, response=None):
|
||||
"""This method must be set as the callback of your last initialization
|
||||
request. See self.init_request() docstring for more info.
|
||||
"""
|
||||
return self.__dict__.pop('_postinit_reqs')
|
||||
|
||||
def init_request(self):
|
||||
"""This function should return one initialization request, with the
|
||||
self.initialized method as callback. When the self.initialized method
|
||||
is called this spider is considered initialized. If you need to perform
|
||||
several requests for initializing your spider, you can do so by using
|
||||
different callbacks. The only requirement is that the final callback
|
||||
(of the last initialization request) must be self.initialized.
|
||||
|
||||
The default implementation calls self.initialized immediately, which
means that no initialization is needed. This method should be
overridden only when you need to perform requests to initialize your
spider.
|
||||
"""
|
||||
return self.initialized()
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiders.init` is deprecated, "
|
||||
"use `scrapy.spiders.init` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.spiders.init import *
|
||||
|
@ -1,79 +1,7 @@
|
||||
import re
|
||||
import logging
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiders.sitemap` is deprecated, "
|
||||
"use `scrapy.spiders.sitemap` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.spider import Spider
|
||||
from scrapy.http import Request, XmlResponse
|
||||
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
|
||||
from scrapy.utils.gz import gunzip, is_gzipped
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SitemapSpider(Spider):
|
||||
|
||||
sitemap_urls = ()
|
||||
sitemap_rules = [('', 'parse')]
|
||||
sitemap_follow = ['']
|
||||
sitemap_alternate_links = False
|
||||
|
||||
def __init__(self, *a, **kw):
|
||||
super(SitemapSpider, self).__init__(*a, **kw)
|
||||
self._cbs = []
|
||||
for r, c in self.sitemap_rules:
|
||||
if isinstance(c, basestring):
|
||||
c = getattr(self, c)
|
||||
self._cbs.append((regex(r), c))
|
||||
self._follow = [regex(x) for x in self.sitemap_follow]
|
||||
|
||||
def start_requests(self):
|
||||
return (Request(x, callback=self._parse_sitemap) for x in self.sitemap_urls)
|
||||
|
||||
def _parse_sitemap(self, response):
|
||||
if response.url.endswith('/robots.txt'):
|
||||
for url in sitemap_urls_from_robots(response.body):
|
||||
yield Request(url, callback=self._parse_sitemap)
|
||||
else:
|
||||
body = self._get_sitemap_body(response)
|
||||
if body is None:
|
||||
logger.warning("Ignoring invalid sitemap: %(response)s",
|
||||
{'response': response}, extra={'spider': self})
|
||||
return
|
||||
|
||||
s = Sitemap(body)
|
||||
if s.type == 'sitemapindex':
|
||||
for loc in iterloc(s, self.sitemap_alternate_links):
|
||||
if any(x.search(loc) for x in self._follow):
|
||||
yield Request(loc, callback=self._parse_sitemap)
|
||||
elif s.type == 'urlset':
|
||||
for loc in iterloc(s):
|
||||
for r, c in self._cbs:
|
||||
if r.search(loc):
|
||||
yield Request(loc, callback=c)
|
||||
break
|
||||
|
||||
def _get_sitemap_body(self, response):
|
||||
"""Return the sitemap body contained in the given response, or None if the
|
||||
response is not a sitemap.
|
||||
"""
|
||||
if isinstance(response, XmlResponse):
|
||||
return response.body
|
||||
elif is_gzipped(response):
|
||||
return gunzip(response.body)
|
||||
elif response.url.endswith('.xml'):
|
||||
return response.body
|
||||
elif response.url.endswith('.xml.gz'):
|
||||
return gunzip(response.body)
|
||||
|
||||
def regex(x):
|
||||
if isinstance(x, basestring):
|
||||
return re.compile(x)
|
||||
return x
|
||||
|
||||
def iterloc(it, alt=False):
|
||||
for d in it:
|
||||
yield d['loc']
|
||||
|
||||
# Also consider alternate URLs (xhtml:link rel="alternate")
|
||||
if alt and 'alternate' in d:
|
||||
for l in d['alternate']:
|
||||
yield l
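A minimal, hypothetical SitemapSpider sketch (URLs and rule patterns are invented; the ``scrapy.spiders`` import assumes the relocated path)::

    from scrapy.spiders import SitemapSpider

    class StoreSitemapSpider(SitemapSpider):
        name = 'store-sitemap'
        sitemap_urls = ['http://example.com/robots.txt']   # robots.txt or sitemap URLs
        sitemap_rules = [
            ('/product/', 'parse_product'),   # first matching rule wins
            ('', 'parse'),                    # fallback rule
        ]
        sitemap_follow = ['/sitemap_shop']    # only recurse into matching sub-sitemaps

        def parse_product(self, response):
            return {'url': response.url}

        def parse(self, response):
            return []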
|
||||
from scrapy.spiders.sitemap import *
|
||||
|
@ -1,34 +1,7 @@
|
||||
import os
|
||||
from six.moves import cPickle as pickle
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.spiderstate` is deprecated, "
|
||||
"use `scrapy.extensions.spiderstate` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.utils.job import job_dir
|
||||
|
||||
class SpiderState(object):
|
||||
"""Store and load spider state during a scraping job"""
|
||||
|
||||
def __init__(self, jobdir=None):
|
||||
self.jobdir = jobdir
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
obj = cls(job_dir(crawler.settings))
|
||||
crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
|
||||
crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
|
||||
return obj
|
||||
|
||||
def spider_closed(self, spider):
|
||||
if self.jobdir:
|
||||
with open(self.statefn, 'wb') as f:
|
||||
pickle.dump(spider.state, f, protocol=2)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
if self.jobdir and os.path.exists(self.statefn):
|
||||
with open(self.statefn, 'rb') as f:
|
||||
spider.state = pickle.load(f)
|
||||
else:
|
||||
spider.state = {}
|
||||
|
||||
@property
|
||||
def statefn(self):
|
||||
return os.path.join(self.jobdir, 'spider.state')
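Spider state only persists when a job directory is configured; assuming the extension is enabled and ``job_dir()`` reads the usual JOBDIR setting, a sketch looks like this (spider and directory names are invented)::

    # run with: scrapy crawl resumable -s JOBDIR=crawls/run1
    from scrapy.spider import Spider

    class ResumableSpider(Spider):
        name = 'resumable'
        start_urls = ['http://example.com/']

        def parse(self, response):
            # spider.state is pickled to <JOBDIR>/spider.state across runs
            self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1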
|
||||
from scrapy.extensions.spiderstate import *
|
||||
|
@ -1,34 +1,7 @@
|
||||
"""
|
||||
StatsMailer extension sends an email when a spider finishes scraping.
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.statsmailer` is deprecated, "
|
||||
"use `scrapy.extensions.statsmailer` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
|
||||
"""
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.mail import MailSender
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
class StatsMailer(object):
|
||||
|
||||
def __init__(self, stats, recipients, mail):
|
||||
self.stats = stats
|
||||
self.recipients = recipients
|
||||
self.mail = mail
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
|
||||
if not recipients:
|
||||
raise NotConfigured
|
||||
mail = MailSender.from_settings(crawler.settings)
|
||||
o = cls(crawler.stats, recipients, mail)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_closed(self, spider):
|
||||
spider_stats = self.stats.get_stats(spider)
|
||||
body = "Global stats\n\n"
|
||||
body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
|
||||
body += "\n\n%s stats\n\n" % spider.name
|
||||
body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
|
||||
return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
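A settings sketch; the addresses are placeholders, and the MAIL_* values are the usual MailSender settings rather than anything specific to this extension::

    # settings.py -- the extension stays NotConfigured until recipients are set
    STATSMAILER_RCPTS = ['scrapy-stats@example.com']
    MAIL_FROM = 'crawler@example.com'
    MAIL_HOST = 'smtp.example.com'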
|
||||
from scrapy.extensions.statsmailer import *
|
||||
|
@ -1,80 +1,7 @@
|
||||
import logging
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib.throttle` is deprecated, "
|
||||
"use `scrapy.extensions.throttle` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy import signals
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AutoThrottle(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.crawler = crawler
|
||||
if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
|
||||
raise NotConfigured
|
||||
|
||||
self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
|
||||
crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def _spider_opened(self, spider):
|
||||
self.mindelay = self._min_delay(spider)
|
||||
self.maxdelay = self._max_delay(spider)
|
||||
spider.download_delay = self._start_delay(spider)
|
||||
|
||||
def _min_delay(self, spider):
|
||||
s = self.crawler.settings
|
||||
return getattr(spider, 'download_delay', 0.0) or \
|
||||
s.getfloat('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY') or \
|
||||
s.getfloat('DOWNLOAD_DELAY')
|
||||
|
||||
def _max_delay(self, spider):
|
||||
return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY', 60.0)
|
||||
|
||||
def _start_delay(self, spider):
|
||||
return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY', 5.0))
|
||||
|
||||
def _response_downloaded(self, response, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
latency = request.meta.get('download_latency')
|
||||
if latency is None or slot is None:
|
||||
return
|
||||
|
||||
olddelay = slot.delay
|
||||
self._adjust_delay(slot, latency, response)
|
||||
if self.debug:
|
||||
diff = slot.delay - olddelay
|
||||
size = len(response.body)
|
||||
conc = len(slot.transferring)
|
||||
logger.info(
|
||||
"slot: %(slot)s | conc:%(concurrency)2d | "
|
||||
"delay:%(delay)5d ms (%(delaydiff)+d) | "
|
||||
"latency:%(latency)5d ms | size:%(size)6d bytes",
|
||||
{
|
||||
'slot': key, 'concurrency': conc,
|
||||
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
|
||||
'latency': latency * 1000, 'size': size
|
||||
},
|
||||
extra={'spider': spider}
|
||||
)
|
||||
|
||||
def _get_slot(self, request, spider):
|
||||
key = request.meta.get('download_slot')
|
||||
return key, self.crawler.engine.downloader.slots.get(key)
|
||||
|
||||
def _adjust_delay(self, slot, latency, response):
|
||||
"""Define delay adjustment policy"""
|
||||
# If latency is bigger than old delay, then use latency instead of mean.
|
||||
# It works better with problematic sites
|
||||
new_delay = min(max(self.mindelay, latency, (slot.delay + latency) / 2.0), self.maxdelay)
|
||||
|
||||
# Don't adjust the delay if the response status != 200 and the new delay
# is smaller than the old one, as error pages (and redirections) are
# usually small and so tend to reduce latency, thus provoking a positive
# feedback loop by reducing the delay instead of increasing it.
|
||||
if response.status == 200 or new_delay > slot.delay:
|
||||
slot.delay = new_delay
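A settings sketch with the values read by the extension above (the numbers are examples)::

    # settings.py
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_START_DELAY = 5.0   # initial download_delay per spider
    AUTOTHROTTLE_MAX_DELAY = 60.0    # hard ceiling applied in _adjust_delay()
    AUTOTHROTTLE_DEBUG = True        # log slot/latency/delay info per response
    DOWNLOAD_DELAY = 0.25            # also acts as a minimum delay here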
|
||||
from scrapy.extensions.throttle import *
|
||||
|
@ -1,7 +1,7 @@
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.contrib_exp.downloadermiddleware.decompression` is deprecated, "
|
||||
"use `scrapy.contrib.downloadermiddleware.decompression` instead",
|
||||
"use `scrapy.downloadermiddlewares.decompression` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.contrib.downloadermiddleware.decompression import DecompressionMiddleware
|
||||
from scrapy.downloadermiddlewares.decompression import DecompressionMiddleware
|
||||
|
@ -1,6 +1,6 @@
|
||||
from w3lib.url import file_uri_to_path
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.decorator import defers
|
||||
from scrapy.utils.decorators import defers
|
||||
|
||||
class FileDownloadHandler(object):
|
||||
|
||||
|
0
scrapy/downloadermiddlewares/__init__.py
Normal file
96
scrapy/downloadermiddlewares/ajaxcrawl.py
Normal file
@ -0,0 +1,96 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
import re
|
||||
import logging
|
||||
|
||||
import six
|
||||
from w3lib import html
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import _noscript_re, _script_re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AjaxCrawlMiddleware(object):
|
||||
"""
|
||||
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
|
||||
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('AJAXCRAWL_ENABLED'):
|
||||
raise NotConfigured
|
||||
|
||||
# XXX: Google parses at least first 100k bytes; scrapy's redirect
|
||||
# middleware parses first 4k. 4k turns out to be insufficient
|
||||
# for this middleware, and parsing 100k could be slow.
|
||||
# We use something in between (32K) by default.
|
||||
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
|
||||
if not isinstance(response, HtmlResponse) or response.status != 200:
|
||||
return response
|
||||
|
||||
if request.method != 'GET':
|
||||
# other HTTP methods are either not safe or don't have a body
|
||||
return response
|
||||
|
||||
if 'ajax_crawlable' in request.meta: # prevent loops
|
||||
return response
|
||||
|
||||
if not self._has_ajax_crawlable_variant(response):
|
||||
return response
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawl_request = request.replace(url=request.url+'#!')
|
||||
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
|
||||
extra={'spider': spider})
|
||||
|
||||
ajax_crawl_request.meta['ajax_crawlable'] = True
|
||||
return ajax_crawl_request
|
||||
|
||||
def _has_ajax_crawlable_variant(self, response):
|
||||
"""
|
||||
Return True if a page without hash fragment could be "AJAX crawlable"
|
||||
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
body = response.body_as_unicode()[:self.lookup_bytes]
|
||||
return _has_ajaxcrawlable_meta(body)
|
||||
|
||||
|
||||
# XXX: move it to w3lib?
|
||||
_ajax_crawlable_re = re.compile(six.u(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'))
|
||||
def _has_ajaxcrawlable_meta(text):
|
||||
"""
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
|
||||
False
|
||||
>>> _has_ajaxcrawlable_meta('<html></html>')
|
||||
False
|
||||
"""
|
||||
|
||||
# Stripping scripts and comments is slow (about 20x slower than
|
||||
# just checking if a string is in text); this is a quick fail-fast
|
||||
# path that should work for most pages.
|
||||
if 'fragment' not in text:
|
||||
return False
|
||||
if 'content' not in text:
|
||||
return False
|
||||
|
||||
text = _script_re.sub(u'', text)
|
||||
text = _noscript_re.sub(u'', text)
|
||||
text = html.remove_comments(html.replace_entities(text))
|
||||
return _ajax_crawlable_re.search(text) is not None
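A settings sketch; the middleware stays disabled unless explicitly enabled::

    # settings.py
    AJAXCRAWL_ENABLED = True
    AJAXCRAWL_MAXSIZE = 32768   # bytes of the body inspected for the meta fragment tag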
|
||||
|
13
scrapy/downloadermiddlewares/chunked.py
Normal file
@ -0,0 +1,13 @@
|
||||
from scrapy.utils.http import decode_chunked_transfer
|
||||
|
||||
|
||||
class ChunkedTransferMiddleware(object):
|
||||
"""This middleware adds support for chunked transfer encoding, as
|
||||
documented in: http://en.wikipedia.org/wiki/Chunked_transfer_encoding
|
||||
"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if response.headers.get('Transfer-Encoding') == 'chunked':
|
||||
body = decode_chunked_transfer(response.body)
|
||||
return response.replace(body=body)
|
||||
return response
|
91
scrapy/downloadermiddlewares/cookies.py
Normal file
@ -0,0 +1,91 @@
|
||||
import os
|
||||
import six
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import Response
|
||||
from scrapy.http.cookies import CookieJar
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CookiesMiddleware(object):
|
||||
"""This middleware enables working with sites that need cookies"""
|
||||
|
||||
def __init__(self, debug=False):
|
||||
self.jars = defaultdict(CookieJar)
|
||||
self.debug = debug
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COOKIES_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return
|
||||
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
cookies = self._get_request_cookies(jar, request)
|
||||
for cookie in cookies:
|
||||
jar.set_cookie_if_ok(cookie, request)
|
||||
|
||||
# set Cookie header
|
||||
request.headers.pop('Cookie', None)
|
||||
jar.add_cookie_header(request)
|
||||
self._debug_cookie(request, spider)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return response
|
||||
|
||||
# extract cookies from Set-Cookie and drop invalid/expired cookies
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
jar.extract_cookies(response, request)
|
||||
self._debug_set_cookie(response, spider)
|
||||
|
||||
return response
|
||||
|
||||
def _debug_cookie(self, request, spider):
|
||||
if self.debug:
|
||||
cl = request.headers.getlist('Cookie')
|
||||
if cl:
|
||||
msg = "Sending cookies to: %s" % request + os.linesep
|
||||
msg += os.linesep.join("Cookie: %s" % c for c in cl)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _debug_set_cookie(self, response, spider):
|
||||
if self.debug:
|
||||
cl = response.headers.getlist('Set-Cookie')
|
||||
if cl:
|
||||
msg = "Received cookies from: %s" % response + os.linesep
|
||||
msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _format_cookie(self, cookie):
|
||||
# build cookie string
|
||||
cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
|
||||
|
||||
if cookie.get('path', None):
|
||||
cookie_str += '; Path=%s' % cookie['path']
|
||||
if cookie.get('domain', None):
|
||||
cookie_str += '; Domain=%s' % cookie['domain']
|
||||
|
||||
return cookie_str
|
||||
|
||||
def _get_request_cookies(self, jar, request):
|
||||
if isinstance(request.cookies, dict):
|
||||
cookie_list = [{'name': k, 'value': v} for k, v in \
|
||||
six.iteritems(request.cookies)]
|
||||
else:
|
||||
cookie_list = request.cookies
|
||||
|
||||
cookies = [self._format_cookie(x) for x in cookie_list]
|
||||
headers = {'Set-Cookie': cookies}
|
||||
response = Response(request.url, headers=headers)
|
||||
|
||||
return jar.make_cookies(response, request)
|
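A minimal usage sketch of the cookie jars managed above (spider name and URLs are illustrative); the "cookiejar" meta key is the one looked up in process_request() and process_response() to keep separate sessions:

import scrapy

class SessionSpider(scrapy.Spider):
    name = 'sessions'  # hypothetical spider

    def start_requests(self):
        # one independent cookie jar per session id
        for session_id in range(3):
            yield scrapy.Request('http://example.com/login',
                                 meta={'cookiejar': session_id},
                                 callback=self.after_login)

    def after_login(self, response):
        # reuse the same jar for follow-up requests
        yield scrapy.Request('http://example.com/account',
                             meta={'cookiejar': response.meta['cookiejar']})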
88  scrapy/downloadermiddlewares/decompression.py  Normal file
@ -0,0 +1,88 @@
|
||||
""" This module implements the DecompressionMiddleware which tries to recognise
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
"""
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import zipfile
|
||||
import tarfile
|
||||
import logging
|
||||
from tempfile import mktemp
|
||||
|
||||
import six
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO as BytesIO
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DecompressionMiddleware(object):
|
||||
""" This middleware tries to recognise and extract the possibly compressed
|
||||
responses that may arrive. """
|
||||
|
||||
def __init__(self):
|
||||
self._formats = {
|
||||
'tar': self._is_tar,
|
||||
'zip': self._is_zip,
|
||||
'gz': self._is_gzip,
|
||||
'bz2': self._is_bzip2
|
||||
}
|
||||
|
||||
def _is_tar(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
|
||||
except tarfile.ReadError:
|
||||
return
|
||||
|
||||
body = tar_file.extractfile(tar_file.members[0]).read()
|
||||
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_zip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
zip_file = zipfile.ZipFile(archive)
|
||||
except zipfile.BadZipfile:
|
||||
return
|
||||
|
||||
namelist = zip_file.namelist()
|
||||
body = zip_file.read(namelist[0])
|
||||
respcls = responsetypes.from_args(filename=namelist[0], body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_gzip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
body = gzip.GzipFile(fileobj=archive).read()
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_bzip2(self, response):
|
||||
try:
|
||||
body = bz2.decompress(response.body)
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if not response.body:
|
||||
return response
|
||||
|
||||
for fmt, func in six.iteritems(self._formats):
|
||||
new_response = func(response)
|
||||
if new_response:
|
||||
logger.debug('Decompressed response with format: %(responsefmt)s',
|
||||
{'responsefmt': fmt}, extra={'spider': spider})
|
||||
return new_response
|
||||
return response
|
19  scrapy/downloadermiddlewares/defaultheaders.py  Normal file
@ -0,0 +1,19 @@
"""
DefaultHeaders downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""


class DefaultHeadersMiddleware(object):

    def __init__(self, headers):
        self._headers = headers

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('DEFAULT_REQUEST_HEADERS').items())

    def process_request(self, request, spider):
        for k, v in self._headers:
            request.headers.setdefault(k, v)
|
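A configuration sketch for the middleware above (values are illustrative); the setting name is the one read in from_crawler():

# settings.py
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}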
26  scrapy/downloadermiddlewares/downloadtimeout.py  Normal file
@ -0,0 +1,26 @@
"""
Download timeout middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from scrapy import signals


class DownloadTimeoutMiddleware(object):

    def __init__(self, timeout=180):
        self._timeout = timeout

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self._timeout = getattr(spider, 'download_timeout', self._timeout)

    def process_request(self, request, spider):
        if self._timeout:
            request.meta.setdefault('download_timeout', self._timeout)
|
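A usage sketch (values and spider name are illustrative); the project-wide setting is read in from_crawler() and the spider attribute in spider_opened():

# settings.py
DOWNLOAD_TIMEOUT = 30  # seconds

# or per spider
import scrapy

class SlowSiteSpider(scrapy.Spider):
    name = 'slowsite'
    download_timeout = 60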
31  scrapy/downloadermiddlewares/httpauth.py  Normal file
@ -0,0 +1,31 @@
"""
HTTP basic auth downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from w3lib.http import basic_auth_header

from scrapy import signals


class HttpAuthMiddleware(object):
    """Set Basic HTTP Authorization header
    (http_user and http_pass spider class attributes)"""

    @classmethod
    def from_crawler(cls, crawler):
        o = cls()
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        usr = getattr(spider, 'http_user', '')
        pwd = getattr(spider, 'http_pass', '')
        if usr or pwd:
            self.auth = basic_auth_header(usr, pwd)

    def process_request(self, request, spider):
        auth = getattr(self, 'auth', None)
        if auth and 'Authorization' not in request.headers:
            request.headers['Authorization'] = auth
|
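A usage sketch (spider name, credentials and URL are illustrative); the two attributes below are the ones read in spider_opened():

import scrapy

class IntranetSpider(scrapy.Spider):
    name = 'intranet'
    http_user = 'someuser'
    http_pass = 'somepass'
    start_urls = ['http://intranet.example.com/']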
92  scrapy/downloadermiddlewares/httpcache.py  Normal file
@ -0,0 +1,92 @@
|
||||
from email.utils import formatdate
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
|
||||
class HttpCacheMiddleware(object):
|
||||
|
||||
def __init__(self, settings, stats):
|
||||
if not settings.getbool('HTTPCACHE_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
|
||||
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings, crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.storage.open_spider(spider)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.storage.close_spider(spider)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_cache', False):
|
||||
return
|
||||
|
||||
# Skip uncacheable requests
|
||||
if not self.policy.should_cache_request(request):
|
||||
request.meta['_dont_cache'] = True # flag as uncacheable
|
||||
return
|
||||
|
||||
# Look for cached response and check if expired
|
||||
cachedresponse = self.storage.retrieve_response(spider, request)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/miss', spider=spider)
|
||||
if self.ignore_missing:
|
||||
self.stats.inc_value('httpcache/ignore', spider=spider)
|
||||
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
||||
return # first time request
|
||||
|
||||
# Return cached response only if not expired
|
||||
cachedresponse.flags.append('cached')
|
||||
if self.policy.is_cached_response_fresh(cachedresponse, request):
|
||||
self.stats.inc_value('httpcache/hit', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
# Keep a reference to cached response to avoid a second cache lookup on
|
||||
# process_response hook
|
||||
request.meta['cached_response'] = cachedresponse
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_cache', False):
|
||||
return response
|
||||
|
||||
# Skip cached responses and uncacheable requests
|
||||
if 'cached' in response.flags or '_dont_cache' in request.meta:
|
||||
request.meta.pop('_dont_cache', None)
|
||||
return response
|
||||
|
||||
# RFC2616 requires origin server to set Date header,
|
||||
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
|
||||
if 'Date' not in response.headers:
|
||||
response.headers['Date'] = formatdate(usegmt=1)
|
||||
|
||||
# Do not validate first-hand responses
|
||||
cachedresponse = request.meta.pop('cached_response', None)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/firsthand', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
if self.policy.is_cached_response_valid(cachedresponse, response, request):
|
||||
self.stats.inc_value('httpcache/revalidate', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
self.stats.inc_value('httpcache/invalidate', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
def _cache_response(self, spider, response, request, cachedresponse):
|
||||
if self.policy.should_cache_response(response, request):
|
||||
self.stats.inc_value('httpcache/store', spider=spider)
|
||||
self.storage.store_response(spider, request, response)
|
||||
else:
|
||||
self.stats.inc_value('httpcache/uncacheable', spider=spider)
|
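A minimal configuration sketch (values are illustrative); the policy path points at the RFC2616Policy class added in scrapy/extensions/httpcache.py below, and the other names match the settings read in __init__():

# settings.py
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_IGNORE_HTTP_CODES = [503]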
56  scrapy/downloadermiddlewares/httpcompression.py  Normal file
@ -0,0 +1,56 @@
|
||||
import zlib
|
||||
|
||||
from scrapy.utils.gz import gunzip, is_gzipped
|
||||
from scrapy.http import Response, TextResponse
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
class HttpCompressionMiddleware(object):
|
||||
"""This middleware allows compressed (gzip, deflate) traffic to be
|
||||
sent/received from web sites"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls()
|
||||
|
||||
def process_request(self, request, spider):
|
||||
request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if isinstance(response, Response):
|
||||
content_encoding = response.headers.getlist('Content-Encoding')
|
||||
if content_encoding and not is_gzipped(response):
|
||||
encoding = content_encoding.pop()
|
||||
decoded_body = self._decode(response.body, encoding.lower())
|
||||
respcls = responsetypes.from_args(headers=response.headers, \
|
||||
url=response.url)
|
||||
kwargs = dict(cls=respcls, body=decoded_body)
|
||||
if issubclass(respcls, TextResponse):
|
||||
# force recalculating the encoding until we make sure the
|
||||
# responsetypes guessing is reliable
|
||||
kwargs['encoding'] = None
|
||||
response = response.replace(**kwargs)
|
||||
if not content_encoding:
|
||||
del response.headers['Content-Encoding']
|
||||
|
||||
return response
|
||||
|
||||
def _decode(self, body, encoding):
|
||||
if encoding == 'gzip' or encoding == 'x-gzip':
|
||||
body = gunzip(body)
|
||||
|
||||
if encoding == 'deflate':
|
||||
try:
|
||||
body = zlib.decompress(body)
|
||||
except zlib.error:
|
||||
# ugly hack to work with raw deflate content that may
|
||||
# be sent by microsoft servers. For more information, see:
|
||||
# http://carsten.codimi.de/gzip.yaws/
|
||||
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
|
||||
# http://www.gzip.org/zlib/zlib_faq.html#faq38
|
||||
body = zlib.decompress(body, -15)
|
||||
return body
|
||||
|
55  scrapy/downloadermiddlewares/httpproxy.py  Normal file
@ -0,0 +1,55 @@
|
||||
import base64
|
||||
from six.moves.urllib.request import getproxies, proxy_bypass
|
||||
from six.moves.urllib.parse import unquote
|
||||
try:
|
||||
from urllib2 import _parse_proxy
|
||||
except ImportError:
|
||||
from urllib.request import _parse_proxy
|
||||
from six.moves.urllib.parse import urlunparse
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
class HttpProxyMiddleware(object):
|
||||
|
||||
def __init__(self):
|
||||
self.proxies = {}
|
||||
for type, url in getproxies().items():
|
||||
self.proxies[type] = self._get_proxy(url, type)
|
||||
|
||||
if not self.proxies:
|
||||
raise NotConfigured
|
||||
|
||||
def _get_proxy(self, url, orig_type):
|
||||
proxy_type, user, password, hostport = _parse_proxy(url)
|
||||
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
|
||||
|
||||
if user and password:
|
||||
user_pass = '%s:%s' % (unquote(user), unquote(password))
|
||||
creds = base64.b64encode(user_pass).strip()
|
||||
else:
|
||||
creds = None
|
||||
|
||||
return creds, proxy_url
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# ignore if proxy is already set
|
||||
if 'proxy' in request.meta:
|
||||
return
|
||||
|
||||
parsed = urlparse_cached(request)
|
||||
scheme = parsed.scheme
|
||||
|
||||
# 'no_proxy' is only supported by http schemes
|
||||
if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
|
||||
return
|
||||
|
||||
if scheme in self.proxies:
|
||||
self._set_proxy(request, scheme)
|
||||
|
||||
def _set_proxy(self, request, scheme):
|
||||
creds, proxy = self.proxies[scheme]
|
||||
request.meta['proxy'] = proxy
|
||||
if creds:
|
||||
request.headers['Proxy-Authorization'] = 'Basic ' + creds
|
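A usage sketch (proxy URL is illustrative); the middleware picks up the standard *_proxy environment variables at start-up, and process_request() returns early when a proxy is already set on the request:

import scrapy

# per-request override
request = scrapy.Request('http://example.com/',
                         meta={'proxy': 'http://proxy.example.com:3128'})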
101  scrapy/downloadermiddlewares/redirect.py  Normal file
@ -0,0 +1,101 @@
|
||||
import logging
|
||||
from six.moves.urllib.parse import urljoin
|
||||
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import get_meta_refresh
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseRedirectMiddleware(object):
|
||||
|
||||
enabled_setting = 'REDIRECT_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool(self.enabled_setting):
|
||||
raise NotConfigured
|
||||
|
||||
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
|
||||
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def _redirect(self, redirected, request, spider, reason):
|
||||
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
|
||||
redirects = request.meta.get('redirect_times', 0) + 1
|
||||
|
||||
if ttl and redirects <= self.max_redirect_times:
|
||||
redirected.meta['redirect_times'] = redirects
|
||||
redirected.meta['redirect_ttl'] = ttl - 1
|
||||
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
|
||||
[request.url]
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
|
||||
{'reason': reason, 'redirected': redirected, 'request': request},
|
||||
extra={'spider': spider})
|
||||
return redirected
|
||||
else:
|
||||
logger.debug("Discarding %(request)s: max redirections reached",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest("max redirections reached")
|
||||
|
||||
def _redirect_request_using_get(self, request, redirect_url):
|
||||
redirected = request.replace(url=redirect_url, method='GET', body='')
|
||||
redirected.headers.pop('Content-Type', None)
|
||||
redirected.headers.pop('Content-Length', None)
|
||||
return redirected
|
||||
|
||||
|
||||
class RedirectMiddleware(BaseRedirectMiddleware):
|
||||
"""Handle redirection of requests based on response status and meta-refresh html tag"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_redirect', False):
|
||||
return response
|
||||
|
||||
if request.method == 'HEAD':
|
||||
if response.status in [301, 302, 303, 307] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
else:
|
||||
return response
|
||||
|
||||
if response.status in [302, 303] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = self._redirect_request_using_get(request, redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
if response.status in [301, 307] and 'Location' in response.headers:
|
||||
redirected_url = urljoin(request.url, response.headers['location'])
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
enabled_setting = 'METAREFRESH_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
super(MetaRefreshMiddleware, self).__init__(settings)
|
||||
self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
|
||||
settings.getint('METAREFRESH_MAXDELAY'))
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_redirect', False) or request.method == 'HEAD' or \
|
||||
not isinstance(response, HtmlResponse):
|
||||
return response
|
||||
|
||||
if isinstance(response, HtmlResponse):
|
||||
interval, url = get_meta_refresh(response)
|
||||
if url and interval < self._maxdelay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
||||
return response
|
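A configuration sketch (values are illustrative); the setting names are the ones read by BaseRedirectMiddleware and MetaRefreshMiddleware above, and a single request can opt out via meta={'dont_redirect': True}:

# settings.py
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20
REDIRECT_PRIORITY_ADJUST = 2
METAREFRESH_ENABLED = True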
82  scrapy/downloadermiddlewares/retry.py  Normal file
@ -0,0 +1,82 @@
|
||||
"""
|
||||
An extension to retry failed requests that are potentially caused by temporary
|
||||
problems such as a connection timeout or HTTP 500 error.
|
||||
|
||||
You can change the behaviour of this middleware by modifying the scraping settings:
|
||||
RETRY_TIMES - how many times to retry a failed page
|
||||
RETRY_HTTP_CODES - which HTTP response codes to retry
|
||||
|
||||
Failed pages are collected during the scraping process and rescheduled at the
end, once the spider has finished crawling all regular (non-failed) pages.
Once there are no more failed pages to retry, this middleware sends a signal
(retry_complete), so other extensions can connect to that signal.
|
||||
|
||||
About HTTP errors to consider:
|
||||
|
||||
- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
|
||||
protocol. It's included by default because it's a common code used to
|
||||
indicate server overload, which would be something we want to retry
|
||||
"""
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import TimeoutError, DNSLookupError, \
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError, \
|
||||
ConnectionLost, TCPTimedOutError
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.response import response_status_message
|
||||
from scrapy.xlib.tx import ResponseFailed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryMiddleware(object):
|
||||
|
||||
# IOError is raised by the HttpCompression middleware when trying to
|
||||
# decompress an empty response
|
||||
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError,
|
||||
ConnectionLost, TCPTimedOutError, ResponseFailed,
|
||||
IOError)
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('RETRY_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.max_retry_times = settings.getint('RETRY_TIMES')
|
||||
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
|
||||
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_retry', False):
|
||||
return response
|
||||
if response.status in self.retry_http_codes:
|
||||
reason = response_status_message(response.status)
|
||||
return self._retry(request, reason, spider) or response
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
|
||||
and not request.meta.get('dont_retry', False):
|
||||
return self._retry(request, exception, spider)
|
||||
|
||||
def _retry(self, request, reason, spider):
|
||||
retries = request.meta.get('retry_times', 0) + 1
|
||||
|
||||
if retries <= self.max_retry_times:
|
||||
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
retryreq = request.copy()
|
||||
retryreq.meta['retry_times'] = retries
|
||||
retryreq.dont_filter = True
|
||||
retryreq.priority = request.priority + self.priority_adjust
|
||||
return retryreq
|
||||
else:
|
||||
logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
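A configuration sketch (values are illustrative); the setting names match those read in RetryMiddleware.__init__(), and a single request can opt out via meta={'dont_retry': True}:

# settings.py
RETRY_ENABLED = True
RETRY_TIMES = 2  # retries on top of the first attempt
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
RETRY_PRIORITY_ADJUST = -1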
67  scrapy/downloadermiddlewares/robotstxt.py  Normal file
@ -0,0 +1,67 @@
|
||||
"""
|
||||
This is a middleware to respect robots.txt policies. To activate it you must
|
||||
enable both this middleware and the ROBOTSTXT_OBEY setting.
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from six.moves.urllib import robotparser
|
||||
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsTxtMiddleware(object):
|
||||
DOWNLOAD_PRIORITY = 1000
|
||||
|
||||
def __init__(self, crawler):
|
||||
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
|
||||
raise NotConfigured
|
||||
|
||||
self.crawler = crawler
|
||||
self._useragent = crawler.settings.get('USER_AGENT')
|
||||
self._parsers = {}
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_obey_robotstxt'):
|
||||
return
|
||||
rp = self.robot_parser(request, spider)
|
||||
if rp and not rp.can_fetch(self._useragent, request.url):
|
||||
logger.debug("Forbidden by robots.txt: %(request)s",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest
|
||||
|
||||
def robot_parser(self, request, spider):
|
||||
url = urlparse_cached(request)
|
||||
netloc = url.netloc
|
||||
if netloc not in self._parsers:
|
||||
self._parsers[netloc] = None
|
||||
robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
|
||||
robotsreq = Request(
|
||||
robotsurl,
|
||||
priority=self.DOWNLOAD_PRIORITY,
|
||||
meta={'dont_obey_robotstxt': True}
|
||||
)
|
||||
dfd = self.crawler.engine.download(robotsreq, spider)
|
||||
dfd.addCallback(self._parse_robots)
|
||||
dfd.addErrback(self._logerror, robotsreq, spider)
|
||||
return self._parsers[netloc]
|
||||
|
||||
def _logerror(self, failure, request, spider):
|
||||
if failure.type is not IgnoreRequest:
|
||||
logger.error("Error downloading %(request)s: %(f_exception)s",
|
||||
{'request': request, 'f_exception': failure.value},
|
||||
extra={'spider': spider, 'failure': failure})
|
||||
|
||||
def _parse_robots(self, response):
|
||||
rp = robotparser.RobotFileParser(response.url)
|
||||
rp.parse(response.body.splitlines())
|
||||
self._parsers[urlparse_cached(response).netloc] = rp
|
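As the module docstring notes, the middleware only takes effect when the setting below is enabled:

# settings.py
ROBOTSTXT_OBEY = True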
32  scrapy/downloadermiddlewares/stats.py  Normal file
@ -0,0 +1,32 @@
from scrapy.exceptions import NotConfigured
from scrapy.utils.request import request_httprepr
from scrapy.utils.response import response_httprepr


class DownloaderStats(object):

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('DOWNLOADER_STATS'):
            raise NotConfigured
        return cls(crawler.stats)

    def process_request(self, request, spider):
        self.stats.inc_value('downloader/request_count', spider=spider)
        self.stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
        reqlen = len(request_httprepr(request))
        self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)

    def process_response(self, request, response, spider):
        self.stats.inc_value('downloader/response_count', spider=spider)
        self.stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
        reslen = len(response_httprepr(response))
        self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
        return response

    def process_exception(self, request, exception, spider):
        ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
        self.stats.inc_value('downloader/exception_count', spider=spider)
        self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
|
23  scrapy/downloadermiddlewares/useragent.py  Normal file
@ -0,0 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""

from scrapy import signals


class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault('User-Agent', self.user_agent)
|
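A usage sketch (agent string and spider name are illustrative); USER_AGENT is read in from_crawler() and a per-spider attribute overrides it in spider_opened():

# settings.py
USER_AGENT = 'mybot (+http://www.example.com)'

# or per spider
import scrapy

class PoliteSpider(scrapy.Spider):
    name = 'polite'
    user_agent = 'mybot (+http://www.example.com)'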
@ -1,72 +1,7 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import logging
|
||||
import warnings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
warnings.warn("Module `scrapy.dupefilter` is deprecated, "
|
||||
"use `scrapy.dupefilters` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
|
||||
|
||||
class BaseDupeFilter(object):
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls()
|
||||
|
||||
def request_seen(self, request):
|
||||
return False
|
||||
|
||||
def open(self): # can return deferred
|
||||
pass
|
||||
|
||||
def close(self, reason): # can return a deferred
|
||||
pass
|
||||
|
||||
def log(self, request, spider): # log that a request has been filtered
|
||||
pass
|
||||
|
||||
|
||||
class RFPDupeFilter(BaseDupeFilter):
|
||||
"""Request Fingerprint duplicates filter"""
|
||||
|
||||
def __init__(self, path=None, debug=False):
|
||||
self.file = None
|
||||
self.fingerprints = set()
|
||||
self.logdupes = True
|
||||
self.debug = debug
|
||||
self.logger = logging.getLogger(__name__)
|
||||
if path:
|
||||
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
||||
self.fingerprints.update(x.rstrip() for x in self.file)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
debug = settings.getbool('DUPEFILTER_DEBUG')
|
||||
return cls(job_dir(settings), debug)
|
||||
|
||||
def request_seen(self, request):
|
||||
fp = self.request_fingerprint(request)
|
||||
if fp in self.fingerprints:
|
||||
return True
|
||||
self.fingerprints.add(fp)
|
||||
if self.file:
|
||||
self.file.write(fp + os.linesep)
|
||||
|
||||
def request_fingerprint(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
def close(self, reason):
|
||||
if self.file:
|
||||
self.file.close()
|
||||
|
||||
def log(self, request, spider):
|
||||
if self.debug:
|
||||
msg = "Filtered duplicate request: %(request)s"
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
elif self.logdupes:
|
||||
msg = ("Filtered duplicate request: %(request)s"
|
||||
" - no more duplicates will be shown"
|
||||
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
self.logdupes = False
|
||||
|
||||
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
||||
from scrapy.dupefilters import *
|
||||
|
72  scrapy/dupefilters.py  Normal file
@ -0,0 +1,72 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import logging
|
||||
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
|
||||
|
||||
class BaseDupeFilter(object):
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls()
|
||||
|
||||
def request_seen(self, request):
|
||||
return False
|
||||
|
||||
def open(self): # can return deferred
|
||||
pass
|
||||
|
||||
def close(self, reason): # can return a deferred
|
||||
pass
|
||||
|
||||
def log(self, request, spider): # log that a request has been filtered
|
||||
pass
|
||||
|
||||
|
||||
class RFPDupeFilter(BaseDupeFilter):
|
||||
"""Request Fingerprint duplicates filter"""
|
||||
|
||||
def __init__(self, path=None, debug=False):
|
||||
self.file = None
|
||||
self.fingerprints = set()
|
||||
self.logdupes = True
|
||||
self.debug = debug
|
||||
self.logger = logging.getLogger(__name__)
|
||||
if path:
|
||||
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
||||
self.fingerprints.update(x.rstrip() for x in self.file)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
debug = settings.getbool('DUPEFILTER_DEBUG')
|
||||
return cls(job_dir(settings), debug)
|
||||
|
||||
def request_seen(self, request):
|
||||
fp = self.request_fingerprint(request)
|
||||
if fp in self.fingerprints:
|
||||
return True
|
||||
self.fingerprints.add(fp)
|
||||
if self.file:
|
||||
self.file.write(fp + os.linesep)
|
||||
|
||||
def request_fingerprint(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
def close(self, reason):
|
||||
if self.file:
|
||||
self.file.close()
|
||||
|
||||
def log(self, request, spider):
|
||||
if self.debug:
|
||||
msg = "Filtered duplicate request: %(request)s"
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
elif self.logdupes:
|
||||
msg = ("Filtered duplicate request: %(request)s"
|
||||
" - no more duplicates will be shown"
|
||||
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
self.logdupes = False
|
||||
|
||||
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
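A configuration sketch; RFPDupeFilter is the class defined above, and DUPEFILTER_DEBUG is the flag read in from_settings(). Individual requests can bypass the filter with dont_filter=True:

# settings.py
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = True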
261  scrapy/exporters/__init__.py  Normal file
@ -0,0 +1,261 @@
|
||||
"""
|
||||
Item Exporters are used to export/serialize items into different formats.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import sys
|
||||
import pprint
|
||||
import marshal
|
||||
import six
|
||||
from six.moves import cPickle as pickle
|
||||
from xml.sax.saxutils import XMLGenerator
|
||||
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
from scrapy.item import BaseItem
|
||||
|
||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
|
||||
'JsonItemExporter', 'MarshalItemExporter']
|
||||
|
||||
|
||||
class BaseItemExporter(object):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._configure(kwargs)
|
||||
|
||||
def _configure(self, options, dont_fail=False):
|
||||
"""Configure the exporter by poping options from the ``options`` dict.
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful when using keyword arguments in subclass constructors)
|
||||
"""
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
self.export_empty_fields = options.pop('export_empty_fields', False)
|
||||
self.encoding = options.pop('encoding', 'utf-8')
|
||||
if not dont_fail and options:
|
||||
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
|
||||
|
||||
def export_item(self, item):
|
||||
raise NotImplementedError
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._to_str_if_unicode)
|
||||
return serializer(value)
|
||||
|
||||
def start_exporting(self):
|
||||
pass
|
||||
|
||||
def finish_exporting(self):
|
||||
pass
|
||||
|
||||
def _to_str_if_unicode(self, value):
|
||||
return value.encode(self.encoding) if isinstance(value, unicode) else value
|
||||
|
||||
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
"""
|
||||
if include_empty is None:
|
||||
include_empty = self.export_empty_fields
|
||||
if self.fields_to_export is None:
|
||||
if include_empty and not isinstance(item, dict):
|
||||
field_iter = six.iterkeys(item.fields)
|
||||
else:
|
||||
field_iter = six.iterkeys(item)
|
||||
else:
|
||||
if include_empty:
|
||||
field_iter = self.fields_to_export
|
||||
else:
|
||||
field_iter = (x for x in self.fields_to_export if x in item)
|
||||
|
||||
for field_name in field_iter:
|
||||
if field_name in item:
|
||||
field = {} if isinstance(item, dict) else item.fields[field_name]
|
||||
value = self.serialize_field(field, field_name, item[field_name])
|
||||
else:
|
||||
value = default_value
|
||||
|
||||
yield field_name, value
|
||||
|
||||
|
||||
class JsonLinesItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(self.encoder.encode(itemdict) + '\n')
|
||||
|
||||
|
||||
class JsonItemExporter(JsonLinesItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.file = file
|
||||
self.encoder = ScrapyJSONEncoder(**kwargs)
|
||||
self.first_item = True
|
||||
|
||||
def start_exporting(self):
|
||||
self.file.write("[")
|
||||
|
||||
def finish_exporting(self):
|
||||
self.file.write("]")
|
||||
|
||||
def export_item(self, item):
|
||||
if self.first_item:
|
||||
self.first_item = False
|
||||
else:
|
||||
self.file.write(',\n')
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(self.encoder.encode(itemdict))
|
||||
|
||||
|
||||
class XmlItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self.item_element = kwargs.pop('item_element', 'item')
|
||||
self.root_element = kwargs.pop('root_element', 'items')
|
||||
self._configure(kwargs)
|
||||
self.xg = XMLGenerator(file, encoding=self.encoding)
|
||||
|
||||
def start_exporting(self):
|
||||
self.xg.startDocument()
|
||||
self.xg.startElement(self.root_element, {})
|
||||
|
||||
def export_item(self, item):
|
||||
self.xg.startElement(self.item_element, {})
|
||||
for name, value in self._get_serialized_fields(item, default_value=''):
|
||||
self._export_xml_field(name, value)
|
||||
self.xg.endElement(self.item_element)
|
||||
|
||||
def finish_exporting(self):
|
||||
self.xg.endElement(self.root_element)
|
||||
self.xg.endDocument()
|
||||
|
||||
def _export_xml_field(self, name, serialized_value):
|
||||
self.xg.startElement(name, {})
|
||||
if hasattr(serialized_value, 'items'):
|
||||
for subname, value in serialized_value.items():
|
||||
self._export_xml_field(subname, value)
|
||||
elif hasattr(serialized_value, '__iter__'):
|
||||
for value in serialized_value:
|
||||
self._export_xml_field('value', value)
|
||||
else:
|
||||
self._xg_characters(serialized_value)
|
||||
self.xg.endElement(name)
|
||||
|
||||
# Workaround for http://bugs.python.org/issue17606
|
||||
# Before Python 2.7.4 xml.sax.saxutils required bytes;
|
||||
# since 2.7.4 it requires unicode. The bug is likely to be
|
||||
# fixed in 2.7.6, but 2.7.6 will still support unicode,
|
||||
# and Python 3.x will require unicode, so ">= 2.7.4" should be fine.
|
||||
if sys.version_info[:3] >= (2, 7, 4):
|
||||
def _xg_characters(self, serialized_value):
|
||||
if not isinstance(serialized_value, unicode):
|
||||
serialized_value = serialized_value.decode(self.encoding)
|
||||
return self.xg.characters(serialized_value)
|
||||
else:
|
||||
def _xg_characters(self, serialized_value):
|
||||
return self.xg.characters(serialized_value)
|
||||
|
||||
|
||||
class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
|
||||
self._configure(kwargs, dont_fail=True)
|
||||
self.include_headers_line = include_headers_line
|
||||
self.csv_writer = csv.writer(file, **kwargs)
|
||||
self._headers_not_written = True
|
||||
self._join_multivalued = join_multivalued
|
||||
|
||||
def _to_str_if_unicode(self, value):
|
||||
if isinstance(value, (list, tuple)):
|
||||
try:
|
||||
value = self._join_multivalued.join(value)
|
||||
except TypeError: # list in value may not contain strings
|
||||
pass
|
||||
return super(CsvItemExporter, self)._to_str_if_unicode(value)
|
||||
|
||||
def export_item(self, item):
|
||||
if self._headers_not_written:
|
||||
self._headers_not_written = False
|
||||
self._write_headers_and_set_fields_to_export(item)
|
||||
|
||||
fields = self._get_serialized_fields(item, default_value='',
|
||||
include_empty=True)
|
||||
values = [x[1] for x in fields]
|
||||
self.csv_writer.writerow(values)
|
||||
|
||||
def _write_headers_and_set_fields_to_export(self, item):
|
||||
if self.include_headers_line:
|
||||
if not self.fields_to_export:
|
||||
if isinstance(item, dict):
|
||||
# for dicts try using fields of the first item
|
||||
self.fields_to_export = list(item.keys())
|
||||
else:
|
||||
# use fields declared in Item
|
||||
self.fields_to_export = list(item.fields.keys())
|
||||
self.csv_writer.writerow(self.fields_to_export)
|
||||
|
||||
|
||||
class PickleItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, protocol=2, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
self.protocol = protocol
|
||||
|
||||
def export_item(self, item):
|
||||
d = dict(self._get_serialized_fields(item))
|
||||
pickle.dump(d, self.file, self.protocol)
|
||||
|
||||
|
||||
class MarshalItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
|
||||
def export_item(self, item):
|
||||
marshal.dump(dict(self._get_serialized_fields(item)), self.file)
|
||||
|
||||
|
||||
class PprintItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self._configure(kwargs)
|
||||
self.file = file
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
self.file.write(pprint.pformat(itemdict) + '\n')
|
||||
|
||||
|
||||
class PythonItemExporter(BaseItemExporter):
|
||||
"""The idea behind this exporter is to have a mechanism to serialize items
|
||||
to built-in python types so any serialization library (like
|
||||
json, msgpack, binc, etc) can be used on top of it. Its main goal is to
|
||||
seamlessly support what BaseItemExporter does, plus nested items.
|
||||
"""
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', self._serialize_value)
|
||||
return serializer(value)
|
||||
|
||||
def _serialize_value(self, value):
|
||||
if isinstance(value, BaseItem):
|
||||
return self.export_item(value)
|
||||
if isinstance(value, dict):
|
||||
return dict(self._serialize_dict(value))
|
||||
if hasattr(value, '__iter__'):
|
||||
return [self._serialize_value(v) for v in value]
|
||||
return self._to_str_if_unicode(value)
|
||||
|
||||
def _serialize_dict(self, value):
|
||||
for key, val in six.iteritems(value):
|
||||
yield key, self._serialize_value(val)
|
||||
|
||||
def export_item(self, item):
|
||||
return dict(self._get_serialized_fields(item))
|
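A minimal usage sketch of the exporter API defined above (file name and fields are illustrative); exporters write to an already-open binary file object:

from scrapy.exporters import CsvItemExporter

with open('items.csv', 'wb') as f:
    exporter = CsvItemExporter(f, fields_to_export=['name', 'price'])
    exporter.start_exporting()
    exporter.export_item({'name': 'example', 'price': '9.99'})
    exporter.finish_exporting()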
0  scrapy/extensions/__init__.py  Normal file
65  scrapy/extensions/closespider.py  Normal file
@ -0,0 +1,65 @@
|
||||
"""CloseSpider is an extension that forces spiders to be closed after certain
|
||||
conditions are met.
|
||||
|
||||
See documentation in docs/topics/extensions.rst
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from twisted.internet import reactor
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class CloseSpider(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.crawler = crawler
|
||||
|
||||
self.close_on = {
|
||||
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
|
||||
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
|
||||
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
|
||||
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
|
||||
}
|
||||
|
||||
self.counter = defaultdict(int)
|
||||
|
||||
if self.close_on.get('errorcount'):
|
||||
crawler.signals.connect(self.error_count, signal=signals.spider_error)
|
||||
if self.close_on.get('pagecount'):
|
||||
crawler.signals.connect(self.page_count, signal=signals.response_received)
|
||||
if self.close_on.get('timeout'):
|
||||
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
if self.close_on.get('itemcount'):
|
||||
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
|
||||
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def error_count(self, failure, response, spider):
|
||||
self.counter['errorcount'] += 1
|
||||
if self.counter['errorcount'] == self.close_on['errorcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
|
||||
|
||||
def page_count(self, response, request, spider):
|
||||
self.counter['pagecount'] += 1
|
||||
if self.counter['pagecount'] == self.close_on['pagecount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.task = reactor.callLater(self.close_on['timeout'], \
|
||||
self.crawler.engine.close_spider, spider, \
|
||||
reason='closespider_timeout')
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.counter['itemcount'] += 1
|
||||
if self.counter['itemcount'] == self.close_on['itemcount']:
|
||||
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
|
||||
|
||||
def spider_closed(self, spider):
|
||||
task = getattr(self, 'task', False)
|
||||
if task and task.active():
|
||||
task.cancel()
|
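A configuration sketch (values are illustrative); a value of 0, the default, leaves the corresponding close condition disabled:

# settings.py
CLOSESPIDER_TIMEOUT = 3600    # seconds
CLOSESPIDER_ITEMCOUNT = 1000
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0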
39  scrapy/extensions/corestats.py  Normal file
@ -0,0 +1,39 @@
|
||||
"""
|
||||
Extension for collecting core stats like items scraped and start/finish times
|
||||
"""
|
||||
import datetime
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
class CoreStats(object):
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
|
||||
crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
|
||||
crawler.signals.connect(o.response_received, signal=signals.response_received)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
|
||||
self.stats.set_value('finish_reason', reason, spider=spider)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.stats.inc_value('item_scraped_count', spider=spider)
|
||||
|
||||
def response_received(self, spider):
|
||||
self.stats.inc_value('response_received_count', spider=spider)
|
||||
|
||||
def item_dropped(self, item, spider, exception):
|
||||
reason = exception.__class__.__name__
|
||||
self.stats.inc_value('item_dropped_count', spider=spider)
|
||||
self.stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
|
64  scrapy/extensions/debug.py  Normal file
@ -0,0 +1,64 @@
|
||||
"""
|
||||
Extensions for debugging Scrapy
|
||||
|
||||
See documentation in docs/topics/extensions.rst
|
||||
"""
|
||||
|
||||
import sys
|
||||
import signal
|
||||
import logging
|
||||
import traceback
|
||||
import threading
|
||||
from pdb import Pdb
|
||||
|
||||
from scrapy.utils.engine import format_engine_status
|
||||
from scrapy.utils.trackref import format_live_refs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StackTraceDump(object):
|
||||
|
||||
def __init__(self, crawler=None):
|
||||
self.crawler = crawler
|
||||
try:
|
||||
signal.signal(signal.SIGUSR2, self.dump_stacktrace)
|
||||
signal.signal(signal.SIGQUIT, self.dump_stacktrace)
|
||||
except AttributeError:
|
||||
# win32 platforms don't support SIGUSR signals
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def dump_stacktrace(self, signum, frame):
|
||||
log_args = {
|
||||
'stackdumps': self._thread_stacks(),
|
||||
'enginestatus': format_engine_status(self.crawler.engine),
|
||||
'liverefs': format_live_refs(),
|
||||
}
|
||||
logger.info("Dumping stack trace and engine status\n"
|
||||
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
|
||||
log_args, extra={'crawler': self.crawler})
|
||||
|
||||
def _thread_stacks(self):
|
||||
id2name = dict((th.ident, th.name) for th in threading.enumerate())
|
||||
dumps = ''
|
||||
for id_, frame in sys._current_frames().items():
|
||||
name = id2name.get(id_, '')
|
||||
dump = ''.join(traceback.format_stack(frame))
|
||||
dumps += "# Thread: {0}({1})\n{2}\n".format(name, id_, dump)
|
||||
return dumps
|
||||
|
||||
|
||||
class Debugger(object):
|
||||
def __init__(self):
|
||||
try:
|
||||
signal.signal(signal.SIGUSR2, self._enter_debugger)
|
||||
except AttributeError:
|
||||
# win32 platforms don't support SIGUSR signals
|
||||
pass
|
||||
|
||||
def _enter_debugger(self, signum, frame):
|
||||
Pdb().set_trace(frame.f_back)
|
238  scrapy/extensions/feedexport.py  Normal file
@ -0,0 +1,238 @@
|
||||
"""
|
||||
Feed Exports extension
|
||||
|
||||
See documentation in docs/topics/feed-exports.rst
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import posixpath
|
||||
from tempfile import TemporaryFile
|
||||
from datetime import datetime
|
||||
from six.moves.urllib.parse import urlparse
|
||||
from ftplib import FTP
|
||||
|
||||
from zope.interface import Interface, implementer
|
||||
from twisted.internet import defer, threads
|
||||
from w3lib.url import file_uri_to_path
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.utils.ftp import ftp_makedirs_cwd
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.python import get_func_args
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IFeedStorage(Interface):
|
||||
"""Interface that all Feed Storages must implement"""
|
||||
|
||||
def __init__(uri):
|
||||
"""Initialize the storage with the parameters given in the URI"""
|
||||
|
||||
def open(spider):
|
||||
"""Open the storage for the given spider. It must return a file-like
|
||||
object that will be used for the exporters"""
|
||||
|
||||
def store(file):
|
||||
"""Store the given file stream"""
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class BlockingFeedStorage(object):
|
||||
|
||||
def open(self, spider):
|
||||
return TemporaryFile(prefix='feed-')
|
||||
|
||||
def store(self, file):
|
||||
return threads.deferToThread(self._store_in_thread, file)
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class StdoutFeedStorage(object):
|
||||
|
||||
def __init__(self, uri, _stdout=sys.stdout):
|
||||
self._stdout = _stdout
|
||||
|
||||
def open(self, spider):
|
||||
return self._stdout
|
||||
|
||||
def store(self, file):
|
||||
pass
|
||||
|
||||
|
||||
@implementer(IFeedStorage)
|
||||
class FileFeedStorage(object):
|
||||
|
||||
def __init__(self, uri):
|
||||
self.path = file_uri_to_path(uri)
|
||||
|
||||
def open(self, spider):
|
||||
dirname = os.path.dirname(self.path)
|
||||
if dirname and not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
return open(self.path, 'ab')
|
||||
|
||||
def store(self, file):
|
||||
file.close()
|
||||
|
||||
|
||||
class S3FeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri):
|
||||
from scrapy.conf import settings
|
||||
try:
|
||||
import boto
|
||||
except ImportError:
|
||||
raise NotConfigured
|
||||
self.connect_s3 = boto.connect_s3
|
||||
u = urlparse(uri)
|
||||
self.bucketname = u.hostname
|
||||
self.access_key = u.username or settings['AWS_ACCESS_KEY_ID']
|
||||
self.secret_key = u.password or settings['AWS_SECRET_ACCESS_KEY']
|
||||
self.keyname = u.path
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
conn = self.connect_s3(self.access_key, self.secret_key)
|
||||
bucket = conn.get_bucket(self.bucketname, validate=False)
|
||||
key = bucket.new_key(self.keyname)
|
||||
key.set_contents_from_file(file)
|
||||
key.close()
|
||||
|
||||
|
||||
class FTPFeedStorage(BlockingFeedStorage):
|
||||
|
||||
def __init__(self, uri):
|
||||
u = urlparse(uri)
|
||||
self.host = u.hostname
|
||||
self.port = int(u.port or '21')
|
||||
self.username = u.username
|
||||
self.password = u.password
|
||||
self.path = u.path
|
||||
|
||||
def _store_in_thread(self, file):
|
||||
file.seek(0)
|
||||
ftp = FTP()
|
||||
ftp.connect(self.host, self.port)
|
||||
ftp.login(self.username, self.password)
|
||||
dirname, filename = posixpath.split(self.path)
|
||||
ftp_makedirs_cwd(ftp, dirname)
|
||||
ftp.storbinary('STOR %s' % filename, file)
|
||||
ftp.quit()
|
||||
|
||||
|
||||
class SpiderSlot(object):
|
||||
def __init__(self, file, exporter, storage, uri):
|
||||
self.file = file
|
||||
self.exporter = exporter
|
||||
self.storage = storage
|
||||
self.uri = uri
|
||||
self.itemcount = 0
|
||||
|
||||
|
||||
class FeedExporter(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.settings = settings
|
||||
self.urifmt = settings['FEED_URI']
|
||||
if not self.urifmt:
|
||||
raise NotConfigured
|
||||
self.format = settings['FEED_FORMAT'].lower()
|
||||
self.storages = self._load_components('FEED_STORAGES')
|
||||
self.exporters = self._load_components('FEED_EXPORTERS')
|
||||
if not self._storage_supported(self.urifmt):
|
||||
raise NotConfigured
|
||||
if not self._exporter_supported(self.format):
|
||||
raise NotConfigured
|
||||
self.store_empty = settings.getbool('FEED_STORE_EMPTY')
|
||||
self.export_fields = settings.getlist('FEED_EXPORT_FIELDS')
|
||||
uripar = settings['FEED_URI_PARAMS']
|
||||
self._uripar = load_object(uripar) if uripar else lambda x, y: None
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings)
|
||||
crawler.signals.connect(o.open_spider, signals.spider_opened)
|
||||
crawler.signals.connect(o.close_spider, signals.spider_closed)
|
||||
crawler.signals.connect(o.item_scraped, signals.item_scraped)
|
||||
return o
|
||||
|
||||
def open_spider(self, spider):
|
||||
uri = self.urifmt % self._get_uri_params(spider)
|
||||
storage = self._get_storage(uri)
|
||||
file = storage.open(spider)
|
||||
exporter = self._get_exporter(file, fields_to_export=self.export_fields)
|
||||
exporter.start_exporting()
|
||||
self.slot = SpiderSlot(file, exporter, storage, uri)
|
||||
|
||||
def close_spider(self, spider):
|
||||
slot = self.slot
|
||||
if not slot.itemcount and not self.store_empty:
|
||||
return
|
||||
slot.exporter.finish_exporting()
|
||||
logfmt = "%%s %(format)s feed (%(itemcount)d items) in: %(uri)s"
|
||||
log_args = {'format': self.format,
|
||||
'itemcount': slot.itemcount,
|
||||
'uri': slot.uri}
|
||||
d = defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
|
||||
extra={'spider': spider}))
|
||||
d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
return d
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
slot = self.slot
|
||||
slot.exporter.export_item(item)
|
||||
slot.itemcount += 1
|
||||
return item
|
||||
|
||||
def _load_components(self, setting_prefix):
|
||||
conf = dict(self.settings['%s_BASE' % setting_prefix])
|
||||
conf.update(self.settings[setting_prefix])
|
||||
d = {}
|
||||
for k, v in conf.items():
|
||||
try:
|
||||
d[k] = load_object(v)
|
||||
except NotConfigured:
|
||||
pass
|
||||
return d
|
||||
|
||||
def _exporter_supported(self, format):
|
||||
if format in self.exporters:
|
||||
return True
|
||||
logger.error("Unknown feed format: %(format)s", {'format': format})
|
||||
|
||||
def _storage_supported(self, uri):
|
||||
scheme = urlparse(uri).scheme
|
||||
if scheme in self.storages:
|
||||
try:
|
||||
self._get_storage(uri)
|
||||
return True
|
||||
except NotConfigured:
|
||||
logger.error("Disabled feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
else:
|
||||
logger.error("Unknown feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
|
||||
def _get_exporter(self, *args, **kwargs):
|
||||
return self.exporters[self.format](*args, **kwargs)
|
||||
|
||||
def _get_storage(self, uri):
|
||||
return self.storages[urlparse(uri).scheme](uri)
|
||||
|
||||
def _get_uri_params(self, spider):
|
||||
params = {}
|
||||
for k in dir(spider):
|
||||
params[k] = getattr(spider, k)
|
||||
ts = datetime.utcnow().replace(microsecond=0).isoformat().replace(':', '-')
|
||||
params['time'] = ts
|
||||
self._uripar(params, spider)
|
||||
return params
|
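A configuration sketch (path and fields are illustrative); URI parameters such as %(name)s and %(time)s are filled in by _get_uri_params() above:

# settings.py
FEED_URI = 'file:///tmp/exports/%(name)s/%(time)s.json'
FEED_FORMAT = 'json'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = ['name', 'price']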
378  scrapy/extensions/httpcache.py  Normal file
@ -0,0 +1,378 @@
from __future__ import print_function
import os
import gzip
from six.moves import cPickle as pickle
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary
from email.utils import mktime_tz, parsedate_tz
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.project import data_path
from scrapy.utils.httpobj import urlparse_cached


class DummyPolicy(object):

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, response, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True


class RFC2616Policy(object):

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get('Cache-Control', '')
            self._cc_parsed[r] = parse_cachecontrol(cch)
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if 'no-store' in cc:
            return False
        # Any other request is eligible for caching
        return True

    def should_cache_response(self, response, request):
        # What is cacheable - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
        # Response cacheability - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because the cache cannot deal with partial content
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if 'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Any hint on response expiration is good
        elif 'max-age' in cc or 'Expires' in response.headers:
            return True
        # Firefox falls back to a one-year expiration for these statuses if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration info require at least one validator
        elif response.status in (200, 203, 401):
            return 'Last-Modified' in response.headers or 'ETag' in response.headers
        # Anything else is probably not eligible for caching: it makes no
        # sense to cache responses that carry no expiration info and cannot
        # be revalidated
        else:
            return False

    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if 'no-cache' in cc or 'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)
        if currentage < freshnesslifetime:
            return True
        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if 'Last-Modified' in cachedresponse.headers:
            request.headers['If-Modified-Since'] = cachedresponse.headers['Last-Modified']

        if 'ETag' in cachedresponse.headers:
            request.headers['If-None-Match'] = cachedresponse.headers['ETag']

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410
        cc = self._parse_cachecontrol(response)
        if 'max-age' in cc:
            try:
                return max(0, int(cc['max-age']))
            except ValueError:
                pass

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get('Date')) or now

        # Try HTTP/1.0 Expires header
        if 'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers['Expires'])
            # When parsing the Expires header fails, RFC 2616 section 14.21 says we
            # should treat this as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fallback to heuristic using the Last-Modified header
        # This is not in the RFC but is part of Firefox's caching implementation
        lastmodified = rfc1123_to_epoch(response.headers.get('Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # This request can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute freshness lifetime
        return 0

    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#366
        currentage = 0
        # If the Date header is not set we assume it is a fast connection, and
        # the clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get('Date')) or now
        if now > date:
            currentage = now - date

        if 'Age' in response.headers:
            try:
                age = int(response.headers['Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage


class DbmCacheStorage(object):

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
        self.db = self.dbmodule.open(dbpath, 'c')

    def close_spider(self, spider):
        self.db.close()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
        self.db['%s_time' % key] = str(time())

    def _read_data(self, spider, request):
        key = self._request_key(request)
        db = self.db
        tkey = '%s_time' % key
        if tkey not in db:
            return  # not found

        ts = db[tkey]
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired

        return pickle.loads(db['%s_data' % key])

    def _request_key(self, request):
        return request_fingerprint(request)


class FilesystemCacheStorage(object):

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(repr(metadata))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(rpath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)


class LeveldbCacheStorage(object):

    def __init__(self, settings):
        import leveldb
        self._leveldb = leveldb
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.leveldb' % spider.name)
        self.db = self._leveldb.LevelDB(dbpath)

    def close_spider(self, spider):
        del self.db

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        batch = self._leveldb.WriteBatch()
        batch.Put('%s_data' % key, pickle.dumps(data, protocol=2))
        batch.Put('%s_time' % key, str(time()))
        self.db.Write(batch)

    def _read_data(self, spider, request):
        key = self._request_key(request)
        try:
            ts = self.db.Get('%s_time' % key)
        except KeyError:
            return  # not found or invalid entry

        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired

        try:
            data = self.db.Get('%s_data' % key)
        except KeyError:
            return  # invalid entry
        else:
            return pickle.loads(data)

    def _request_key(self, request):
        return request_fingerprint(request)


def parse_cachecontrol(header):
    """Parse Cache-Control header

    http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9

    >>> parse_cachecontrol('public, max-age=3600') == {'public': None,
    ...                                                'max-age': '3600'}
    True
    >>> parse_cachecontrol('') == {}
    True

    """
    directives = {}
    for directive in header.split(','):
        key, sep, val = directive.strip().partition('=')
        if key:
            directives[key.lower()] = val if sep else None
    return directives


def rfc1123_to_epoch(date_str):
    try:
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
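The storage backends and cache policies above are selected through settings. A hedged sketch of a project configuration picking the RFC 2616 policy with the DBM backend; ``HTTPCACHE_ENABLED``, ``HTTPCACHE_POLICY`` and ``HTTPCACHE_STORAGE`` follow Scrapy's HTTP cache documentation and do not appear in this file, while the remaining names are read directly in the code above::

    # settings.py -- illustrative values
    HTTPCACHE_ENABLED = True                       # assumed setting; enables the HTTP cache middleware
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
    HTTPCACHE_DIR = 'httpcache'                    # resolved via data_path() in the storages above
    HTTPCACHE_EXPIRATION_SECS = 0                  # 0 means cached entries never expire
    HTTPCACHE_DBM_MODULE = 'anydbm'                # module imported by DbmCacheStorage
    HTTPCACHE_IGNORE_HTTP_CODES = [503]            # used by DummyPolicy.should_cache_response
    HTTPCACHE_GZIP = False                         # FilesystemCacheStorage only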
scrapy/extensions/logstats.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import logging

from twisted.internet import task

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class LogStats(object):
    """Log basic scraping stats periodically"""

    def __init__(self, stats, interval=60.0):
        self.stats = stats
        self.interval = interval
        self.multiplier = 60.0 / self.interval

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
        if not interval:
            raise NotConfigured
        o = cls(crawler.stats, interval)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.pagesprev = 0
        self.itemsprev = 0

        self.task = task.LoopingCall(self.log, spider)
        self.task.start(self.interval)

    def log(self, spider):
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
        self.pagesprev, self.itemsprev = pages, items

        msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
               "scraped %(items)d items (at %(itemrate)d items/min)")
        log_args = {'pages': pages, 'pagerate': prate,
                    'items': items, 'itemrate': irate}
        logger.info(msg, log_args, extra={'spider': spider})

    def spider_closed(self, spider, reason):
        if self.task.running:
            self.task.stop()
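LogStats is driven by a single setting, read in ``from_crawler`` above; a falsy interval disables it via ``NotConfigured``. An illustrative snippet (the value is arbitrary)::

    # settings.py -- illustrative value
    LOGSTATS_INTERVAL = 30.0   # log every 30 seconds; rates are still reported per minute
                               # because of the 60.0 / interval multiplier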
scrapy/extensions/memdebug.py (new file, 34 lines)
@@ -0,0 +1,34 @@
"""
MemoryDebugger extension

See documentation in docs/topics/extensions.rst
"""

import gc
import six

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs


class MemoryDebugger(object):

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider, reason):
        gc.collect()
        self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
        for cls, wdict in six.iteritems(live_refs):
            if not wdict:
                continue
            self.stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict), spider=spider)
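MemoryDebugger only needs the boolean flag checked in ``from_crawler``; when enabled it records the stats keys shown in ``spider_closed``. A minimal sketch::

    # settings.py -- illustrative value
    MEMDEBUG_ENABLED = True
    # After the spider closes, look for these stats keys:
    #   memdebug/gc_garbage_count
    #   memdebug/live_refs/<ClassName>   (one per tracked class with live references)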
scrapy/extensions/memusage.py (new file, 122 lines)
@@ -0,0 +1,122 @@
"""
MemoryUsage extension

See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module

from twisted.internet import task

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status

logger = logging.getLogger(__name__)


class MemoryUsage(object):

    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        try:
            # stdlib's resource module is only available on unix platforms.
            self.resource = import_module('resource')
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.report = crawler.settings.getbool('MEMUSAGE_REPORT')
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def get_virtual_size(self):
        size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
        if sys.platform != 'darwin':
            # on Mac OS X ru_maxrss is in bytes, on Linux it is in KB
            size *= 1024
        return size

    def engine_started(self):
        self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(60.0, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(60.0, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        self.crawler.stats.max_value('memusage/max', self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            mem = self.limit/1024/1024
            logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
                         {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = "%s terminated: memory usage exceeded %dM at %s" % \
                       (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)

            open_spiders = self.crawler.engine.open_spiders
            if open_spiders:
                for spider in open_spiders:
                    self.crawler.engine.close_spider(spider, 'memusage_exceeded')
            else:
                self.crawler.stop()

    def _check_warning(self):
        if self.warned:  # warn only once
            return
        if self.get_virtual_size() > self.warning:
            self.crawler.stats.set_value('memusage/warning_reached', 1)
            mem = self.warning/1024/1024
            logger.warning("Memory usage reached %(memusage)dM",
                           {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = "%s warning: memory usage reached %dM at %s" % \
                       (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/warning_notified', 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        stats = self.crawler.stats
        s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
        s += "Maximum memory usage : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
        s += "Current memory usage : %dM\r\n" % (self.get_virtual_size()/1024/1024)

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
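All of the knobs used by MemoryUsage appear in its ``__init__`` above; the extension is also silently disabled on platforms without the stdlib ``resource`` module. A sketch with illustrative values::

    # settings.py -- illustrative values
    MEMUSAGE_ENABLED = True
    MEMUSAGE_LIMIT_MB = 2048                    # close spiders / stop the crawler above this size
    MEMUSAGE_WARNING_MB = 1536                  # send a single warning above this size
    MEMUSAGE_NOTIFY_MAIL = ['ops@example.com']  # recipients for the report mail
    MEMUSAGE_REPORT = True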
scrapy/extensions/spiderstate.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import os
from six.moves import cPickle as pickle

from scrapy import signals
from scrapy.utils.job import job_dir


class SpiderState(object):
    """Store and load spider state during a scraping job"""

    def __init__(self, jobdir=None):
        self.jobdir = jobdir

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls(job_dir(crawler.settings))
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj

    def spider_closed(self, spider):
        if self.jobdir:
            with open(self.statefn, 'wb') as f:
                pickle.dump(spider.state, f, protocol=2)

    def spider_opened(self, spider):
        if self.jobdir and os.path.exists(self.statefn):
            with open(self.statefn, 'rb') as f:
                spider.state = pickle.load(f)
        else:
            spider.state = {}

    @property
    def statefn(self):
        return os.path.join(self.jobdir, 'spider.state')
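SpiderState persists the ``spider.state`` dict between runs when a job directory is configured (``job_dir`` reads it from the settings). A hedged sketch of a spider using it; the spider and the ``JOBDIR`` value below are illustrative and not part of this diff::

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'
        start_urls = ['http://example.com/']

        def parse(self, response):
            # self.state is loaded from <jobdir>/spider.state on open
            # and pickled back there on close (see SpiderState above)
            self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1

Run with something like ``scrapy crawl example -s JOBDIR=crawls/example-1`` so that ``job_dir`` returns a directory.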
scrapy/extensions/statsmailer.py (new file, 34 lines)
@@ -0,0 +1,34 @@
"""
StatsMailer extension sends an email when a spider finishes scraping.

Use the STATSMAILER_RCPTS setting to enable it and set the recipient mail addresses.
"""

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured


class StatsMailer(object):

    def __init__(self, stats, recipients, mail):
        self.stats = stats
        self.recipients = recipients
        self.mail = mail

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
        if not recipients:
            raise NotConfigured
        mail = MailSender.from_settings(crawler.settings)
        o = cls(crawler.stats, recipients, mail)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider):
        spider_stats = self.stats.get_stats(spider)
        body = "Global stats\n\n"
        body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
        body += "\n\n%s stats\n\n" % spider.name
        body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
        return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
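As the docstring says, the extension stays disabled until recipients are configured; the ``MailSender`` is built from the same settings object, so the usual mail settings apply. Illustrative values only::

    # settings.py -- illustrative values
    STATSMAILER_RCPTS = ['scrapy-stats@example.com']   # enables the extension
    MAIL_FROM = 'scrapy@example.com'                   # assumed MailSender setting, not shown in this diff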
scrapy/extensions/throttle.py (new file, 80 lines)
@@ -0,0 +1,80 @@
import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle(object):

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', 0.0) or \
            s.getfloat('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY') or \
            s.getfloat('DOWNLOAD_DELAY')

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY', 60.0)

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY', 5.0))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""
        # If latency is bigger than the old delay, use the latency instead of
        # the mean; this works better with problematic sites.
        new_delay = min(max(self.mindelay, latency, (slot.delay + latency) / 2.0), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, provoking positive
        # feedback by reducing the delay instead of increasing it.
        if response.status == 200 or new_delay > slot.delay:
            slot.delay = new_delay
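The delay policy in ``_adjust_delay`` clamps ``max(mindelay, latency, (delay + latency) / 2)`` to ``maxdelay``, so a slow response raises the delay immediately while a fast one only halves the gap. A sketch of the settings it reads, with illustrative values::

    # settings.py -- illustrative values
    AUTOTHROTTLE_ENABLED = True
    AUTOTHROTTLE_DEBUG = True                  # log slot/concurrency/delay/latency per response
    AUTOTHROTTLE_START_DELAY = 5.0             # initial download_delay set on the spider
    AUTOTHROTTLE_MAX_DELAY = 60.0              # upper clamp used by _adjust_delay
    AUTOTHROTTLE_MIN_DOWNLOAD_DELAY = 1.0      # lower clamp (falls back to DOWNLOAD_DELAY)
    # Example: with a current delay of 2.0s and a 6.0s latency, the new delay is
    # min(max(1.0, 6.0, (2.0 + 6.0) / 2), 60.0) = 6.0 seconds.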
Some files were not shown because too many files have changed in this diff.