Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 10:23:40 +00:00)

commit 0d75a3a636
Automated merge with http://hg.scrapy.org/scrapy-stable/
@ -9,12 +9,15 @@ API stability is one of Scrapy major goals.
|
||||
Versioning
|
||||
==========
|
||||
|
||||
Each Scrapy release consists of three version numbers:
|
||||
When Scrapy reaches 1.0, each release will consist of three version numbers:
|
||||
|
||||
* major - big, backwards-incompatible changes
|
||||
* minor - new features and backwards-compatible changes
|
||||
* micro - bug fixes only
|
||||
|
||||
Until Scrapy reaches 1.0, minor releases (0.7, 0.8, etc) will follow the same
|
||||
policy as major releases.
|
||||
|
||||
Sometimes the micro version can be omitted, for brevity, when it's not
|
||||
relevant.
|
||||
|
||||
|
@ -25,7 +25,7 @@ The :setting:`SCHEDULER_MIDDLEWARES` setting is merged with the
|
||||
:setting:`SCHEDULER_MIDDLEWARES_BASE` setting defined in Scrapy (and not meant
|
||||
to be overridden) and then sorted by order to get the final sorted list of
|
||||
enabled middlewares: the first middleware is the one closer to the engine and
|
||||
the last is the one closer to the spider.
|
||||
the last is the one closer to the scheduler.
|
||||
|
||||
To decide which order to assign to your middleware see the
|
||||
:setting:`SCHEDULER_MIDDLEWARES_BASE` setting and pick a value according to
|
||||
@ -38,9 +38,9 @@ If you want to disable a builtin middleware (the ones defined in
|
||||
in your project :setting:`SCHEDULER_MIDDLEWARES` setting and assign `None` as its
|
||||
value. For example, if you want to disable the duplicates filter middleware::
|
||||
|
||||
SPIDER_MIDDLEWARES = {
|
||||
SCHEDULER_MIDDLEWARES = {
|
||||
'myproject.middlewares.CustomSchedulerMiddleware': 543,
|
||||
'scrapy.contrib.spidermiddleware.duplicatesfilter.DuplicatesFilterMiddleware: None,
|
||||
'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware': None,
|
||||
}
|
||||
|
||||
Finally, keep in mind that some middlewares may need to be enabled through a
|
||||
|
docs/faq.rst (21 changed lines)
@ -54,12 +54,9 @@ to steal from us!
|
||||
Does Scrapy work with HTTP proxies?
|
||||
-----------------------------------
|
||||
|
||||
No. Support for HTTP proxies is not currently implemented in Scrapy, but it
|
||||
will be in the future. For more information about this, follow `this ticket
|
||||
<http://dev.scrapy.org/ticket/71>`_. Setting the ``http_proxy`` environment
|
||||
variable won't work because Twisted (the library used by Scrapy to download
|
||||
pages) doesn't support it. See `this Twisted ticket
|
||||
<http://twistedmatrix.com/trac/ticket/2714>`_ for more info.
|
||||
Yes. Support for HTTP proxies is provided (since Scrapy 0.8) through the HTTP
|
||||
Proxy downloader middleware. See
|
||||
:class:`~scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware`.
|
||||
|
||||
Scrapy crashes with: ImportError: No module named win32api
|
||||
----------------------------------------------------------
|
||||
@ -121,3 +118,15 @@ written in a ``my_spider.py`` file you can run it with::
|
||||
|
||||
scrapy-ctl.py runspider my_spider.py
|
||||
|
||||
I get "Filtered offsite request" messages. How can I fix them?
|
||||
--------------------------------------------------------------
|
||||
|
||||
Those messages (logged with ``DEBUG`` level) don't necessarily mean there is a
|
||||
problem, so you may not need to fix them.
|
||||
|
||||
Those messages are thrown by the Offsite Spider Middleware, which is a spider
|
||||
middleware (enabled by default) whose purpose is to filter out requests to
|
||||
domains outside the ones covered by the spider.
|
||||
|
||||
For more info see:
|
||||
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware`.
|
||||
|
@ -153,7 +153,7 @@ extracted item into a file using `pickle`_::
|
||||
import pickle
|
||||
|
||||
class StoreItemPipeline(object):
|
||||
def process_item(self, domain, response, item):
|
||||
def process_item(self, spider, item):
|
||||
torrent_id = item['url'].split('/')[-1]
|
||||
f = open("torrent-%s.pickle" % torrent_id, "w")
|
||||
pickle.dump(item, f)
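For reference, a complete pipeline with the new ``(spider, item)`` signature might look like this sketch (it also closes the file and returns the item, as every pipeline component must)::

    import pickle

    class StoreItemPipeline(object):
        """Store each scraped item in its own pickle file (illustrative sketch)."""

        def process_item(self, spider, item):
            torrent_id = item['url'].split('/')[-1]
            f = open("torrent-%s.pickle" % torrent_id, "wb")
            pickle.dump(item, f)
            f.close()
            return item  # pipelines must return the item (or raise DropItem)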
|
||||
|
@ -156,17 +156,17 @@ will get an output similar to this::
|
||||
[dmoz] INFO: Enabled downloader middlewares: ...
|
||||
[dmoz] INFO: Enabled spider middlewares: ...
|
||||
[dmoz] INFO: Enabled item pipelines: ...
|
||||
[dmoz.org] INFO: Domain opened
|
||||
[dmoz.org] INFO: Spider opened
|
||||
[dmoz.org] DEBUG: Crawled <http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> from <None>
|
||||
[dmoz.org] DEBUG: Crawled <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> from <None>
|
||||
[dmoz.org] INFO: Domain closed (finished)
|
||||
[dmoz.org] INFO: Spider closed (finished)
|
||||
[-] Main loop terminated.
|
||||
|
||||
Pay attention to the lines containing ``[dmoz.org]``, which corresponds to
|
||||
our spider (identified by the domain "dmoz.org"). You can see a log line for each
|
||||
URL defined in ``start_urls``. Because these URLs are the starting ones, they
|
||||
have no referrers, which is shown at the end of the log line, where it says
|
||||
``from <None>``.
|
||||
our spider (identified by the domain ``"dmoz.org"``). You can see a log line
|
||||
for each URL defined in ``start_urls``. Because these URLs are the starting
|
||||
ones, they have no referrers, which is shown at the end of the log line,
|
||||
where it says ``from <None>``.
|
||||
|
||||
But more interesting, as our ``parse`` method instructs, two files have been
|
||||
created: *Books* and *Resources*, with the content of both URLs.
|
||||
@ -445,7 +445,7 @@ creation step, it's in ``dmoz/pipelines.py`` and looks like this::
|
||||
# Define your item pipelines here
|
||||
|
||||
class DmozPipeline(object):
|
||||
def process_item(self, domain, item):
|
||||
def process_item(self, spider, item):
|
||||
return item
|
||||
|
||||
We have to override the ``process_item`` method in order to store our Items
|
||||
@ -461,7 +461,7 @@ separated values) file using the standard library `csv module`_::
|
||||
def __init__(self):
|
||||
self.csvwriter = csv.writer(open('items.csv', 'wb'))
|
||||
|
||||
def process_item(self, domain, item):
|
||||
def process_item(self, spider, item):
|
||||
self.csvwriter.writerow([item['title'][0], item['link'][0], item['desc'][0]])
|
||||
return item
|
||||
|
||||
|
@ -214,25 +214,86 @@ HttpCacheMiddleware
|
||||
.. class:: HttpCacheMiddleware
|
||||
|
||||
This middleware provides a low-level cache for all HTTP requests and responses.
|
||||
Every request and its corresponding response are cached and then, when that
|
||||
same request is seen again, the response is returned without transferring
|
||||
Every request and its corresponding response are cached. When the same
|
||||
request is seen again, the response is returned without transferring
|
||||
anything from the Internet.
|
||||
|
||||
The HTTP cache is useful for testing spiders faster (without having to wait for
|
||||
downloads every time) and for trying your spider off-line when you don't have
|
||||
an Internet connection.
|
||||
|
||||
The :class:`HttpCacheMiddleware` can be configured through the following
|
||||
settings (see the settings documentation for more info):
|
||||
File system storage
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
By default, the :class:`HttpCacheMiddleware` uses file system storage with the following structure:
|
||||
|
||||
Each request/response pair is stored in a different directory containing
|
||||
the following files:
|
||||
|
||||
* ``request_body`` - the plain request body
|
||||
* ``request_headers`` - the request headers (in raw HTTP format)
|
||||
* ``response_body`` - the plain response body
|
||||
* ``response_headers`` - the response headers (in raw HTTP format)
|
||||
* ``meta`` - some metadata of this cache resource in Python ``repr()`` format
|
||||
(for easy grepability)
|
||||
* ``pickled_meta`` - the same metadata in ``meta`` but pickled for more
|
||||
efficient deserialization
|
||||
|
||||
The directory name is made from the request fingerprint (see
|
||||
``scrapy.utils.request.request_fingerprint``), and one level of subdirectories is
|
||||
used to avoid creating too many files in the same directory (which is
|
||||
inefficient in many file systems). An example directory could be::
|
||||
|
||||
/path/to/cache/dir/example.com/72/72811f648e718090f041317756c03adb0ada46c7
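As an illustration only (this is not the middleware's actual code), such a path could be derived from a request roughly as follows, assuming the spider's domain is used for the top-level directory::

    from os.path import join
    from scrapy.http import Request
    from scrapy.utils.request import request_fingerprint

    request = Request('http://www.example.com/some/page.html')
    fp = request_fingerprint(request)   # hex digest, e.g. '72811f648e71...'
    # one two-character subdirectory level, then the full fingerprint
    path = join('/path/to/cache/dir', 'example.com', fp[0:2], fp)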
|
||||
|
||||
The cache storage backend can be changed with the :setting:`HTTPCACHE_STORAGE`
|
||||
setting, but no other backend is provided with Scrapy yet.
|
||||
|
||||
Settings
|
||||
~~~~~~~~
|
||||
|
||||
The :class:`HttpCacheMiddleware` can be configured through the following
|
||||
settings:
|
||||
|
||||
.. setting:: HTTPCACHE_DIR
|
||||
|
||||
HTTPCACHE_DIR
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
Default: ``''`` (empty string)
|
||||
|
||||
The directory to use for storing the (low-level) HTTP cache. If empty, the HTTP
|
||||
cache will be disabled.
|
||||
|
||||
.. setting:: HTTPCACHE_EXPIRATION_SECS
|
||||
|
||||
HTTPCACHE_EXPIRATION_SECS
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``0``
|
||||
|
||||
Number of seconds to use for HTTP cache expiration. Requests that were cached
|
||||
before this time will be re-downloaded. If zero, cached requests will always
|
||||
expire. Negative numbers mean cached requests will never expire.
|
||||
|
||||
.. setting:: HTTPCACHE_IGNORE_MISSING
|
||||
|
||||
HTTPCACHE_IGNORE_MISSING
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``False``
|
||||
|
||||
If enabled, requests not found in the cache will be ignored instead of downloaded.
|
||||
|
||||
.. setting:: HTTPCACHE_STORAGE
|
||||
|
||||
HTTPCACHE_STORAGE
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'``
|
||||
|
||||
The class which implements the cache storage backend.
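For example, a project's settings module could enable and tune the cache with values like these (a sketch using the settings documented above)::

    HTTPCACHE_DIR = '/path/to/cache/dir'   # a non-empty directory enables the cache
    HTTPCACHE_EXPIRATION_SECS = -1         # negative: cached responses never expire
    HTTPCACHE_IGNORE_MISSING = False       # requests not yet cached are still downloaded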
|
||||
|
||||
* :setting:`HTTPCACHE_DIR` - this one actually enables the cache besides
|
||||
settings the cache dir
|
||||
* :setting:`HTTPCACHE_IGNORE_MISSING` - ignoring missing requests instead
|
||||
of downloading them
|
||||
* :setting:`HTTPCACHE_SECTORIZE` - split HTTP cache in several directories
|
||||
(for performance reasons)
|
||||
* :setting:`HTTPCACHE_EXPIRATION_SECS` - how many secs until the cache is
|
||||
considered out of date
|
||||
|
||||
.. _topics-dlmw-robots:
|
||||
|
||||
@ -247,6 +308,29 @@ HttpCompressionMiddleware
|
||||
This middleware allows compressed (gzip, deflate) traffic to be
|
||||
sent/received from web sites.
|
||||
|
||||
HttpProxyMiddleware
|
||||
-------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.httpproxy
|
||||
:synopsis: Http Proxy Middleware
|
||||
|
||||
.. versionadded:: 0.8
|
||||
|
||||
.. class:: HttpProxyMiddleware
|
||||
|
||||
This middleware sets the HTTP proxy to use for requests, by setting the
|
||||
``proxy`` meta value to :class:`~scrapy.http.Request` objects.
|
||||
|
||||
Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys
|
||||
the following environment variables:
|
||||
|
||||
* ``http_proxy``
|
||||
* ``https_proxy``
|
||||
* ``no_proxy``
|
||||
|
||||
.. _urllib: http://docs.python.org/library/urllib.html
|
||||
.. _urllib2: http://docs.python.org/library/urllib2.html
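For instance, exporting one of these variables before the crawl starts is enough for the middleware to pick it up (a sketch; the proxy URL is hypothetical)::

    import os

    # HttpProxyMiddleware reads the standard *_proxy variables, like urllib
    # does, and sets the ``proxy`` meta value on outgoing requests accordingly.
    os.environ['http_proxy'] = 'http://user:pass@proxy.example.com:3128'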
|
||||
|
||||
RedirectMiddleware
|
||||
-------------------
|
||||
|
||||
|
@ -46,22 +46,22 @@ Exporter to export scraped items to different files, one per spider::
|
||||
class XmlExportPipeline(object):
|
||||
|
||||
def __init__(self):
|
||||
dispatcher.connect(self.domain_opened, signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
self.files = {}
|
||||
|
||||
def domain_opened(self, domain):
|
||||
file = open('%s_products.xml' % domain, 'w+b')
|
||||
self.files[domain] = file
|
||||
def spider_opened(self, spider):
|
||||
file = open('%s_products.xml' % spider.domain_name, 'w+b')
|
||||
self.files[spider] = file
|
||||
self.exporter = XmlItemExporter(file)
|
||||
self.exporter.start_exporting()
|
||||
|
||||
def domain_closed(self, domain):
|
||||
def spider_closed(self, spider):
|
||||
self.exporter.finish_exporting()
|
||||
file = self.files.pop(domain)
|
||||
file = self.files.pop(spider)
|
||||
file.close()
|
||||
|
||||
def process_item(self, domain, item):
|
||||
def process_item(self, spider, item):
|
||||
self.exporter.export_item(item)
|
||||
return item
|
||||
|
||||
|
@ -101,14 +101,14 @@ everytime a domain/spider is opened and closed::
|
||||
class SpiderOpenCloseLogging(object):
|
||||
|
||||
def __init__(self):
|
||||
dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def domain_opened(self, domain, spider):
|
||||
log.msg("opened domain %s" % domain)
|
||||
def spider_opened(self, spider):
|
||||
log.msg("opened spider %s" % spider.domain_name)
|
||||
|
||||
def domain_closed(self, domain, spider):
|
||||
log.msg("closed domain %s" % domain)
|
||||
def spider_closed(self, spider):
|
||||
log.msg("closed spider %s" % spider.domain_name)
|
||||
|
||||
|
||||
.. _topics-extensions-ref-manager:
|
||||
@ -279,10 +279,10 @@ report will also be sent to those addresses.
|
||||
Close domain extension
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. module:: scrapy.contrib.closedomain
|
||||
.. module:: scrapy.contrib.closespider
|
||||
:synopsis: Close domain extension
|
||||
|
||||
.. class:: scrapy.contrib.closedomain.CloseDomain
|
||||
.. class:: scrapy.contrib.closespider.CloseSpider
|
||||
|
||||
Closes a domain/spider automatically when some conditions are met, using a
|
||||
specific closing reason for each condition.
|
||||
@ -290,28 +290,28 @@ specific closing reason for each condition.
|
||||
The conditions for closing a domain can be configured through the following
|
||||
settings. Other conditions will be supported in the future.
|
||||
|
||||
.. setting:: CLOSEDOMAIN_TIMEOUT
|
||||
.. setting:: CLOSESPIDER_TIMEOUT
|
||||
|
||||
CLOSEDOMAIN_TIMEOUT
|
||||
CLOSESPIDER_TIMEOUT
|
||||
"""""""""""""""""""
|
||||
|
||||
Default: ``0``
|
||||
|
||||
An integer which specifies a number of seconds. If the domain remains open for
|
||||
more than that number of seconds, it will be automatically closed with the
|
||||
reason ``closedomain_timeout``. If zero (or non set) domains won't be closed by
|
||||
reason ``closespider_timeout``. If zero (or not set), spiders won't be closed by
|
||||
timeout.
|
||||
|
||||
.. setting:: CLOSEDOMAIN_ITEMPASSED
|
||||
.. setting:: CLOSESPIDER_ITEMPASSED
|
||||
|
||||
CLOSEDOMAIN_ITEMPASSED
|
||||
CLOSESPIDER_ITEMPASSED
|
||||
""""""""""""""""""""""
|
||||
|
||||
Default: ``0``
|
||||
|
||||
An integer which specifies a number of items. If the spider scrapes more than
|
||||
that amount of items and those items are passed by the item pipeline, the
|
||||
domain will be closed with the reason ``closedomain_itempassed``. If zero (or
|
||||
spider will be closed with the reason ``closespider_itempassed``. If zero (or
|
||||
not set), spiders won't be closed by the number of passed items.
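For example, to close every spider after six hours or after 1000 passed items, whichever happens first, a project could set (sketch)::

    CLOSESPIDER_TIMEOUT = 6 * 3600    # seconds the spider may stay open
    CLOSESPIDER_ITEMPASSED = 1000     # items passed through the item pipeline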
|
||||
|
||||
StatsMailer extension
|
||||
|
@ -24,11 +24,13 @@ Writing your own item pipeline
|
||||
Writing your own item pipeline is easy. Each item pipeline component is a
|
||||
single Python class that must define the following method:
|
||||
|
||||
.. method:: process_item(domain, item)
|
||||
.. method:: process_item(spider, item)
|
||||
|
||||
``domain`` is a string with the domain of the spider which scraped the item
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
``item`` is a :class:`~scrapy.item.Item` with the item scraped
|
||||
:param item: the item scraped
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
|
||||
This method is called for every item pipeline component and must either return
|
||||
a :class:`~scrapy.item.Item` (or any descendant class) object or raise a
|
||||
@ -49,7 +51,7 @@ attribute), and drops those items which don't contain a price::
|
||||
|
||||
vat_factor = 1.15
|
||||
|
||||
def process_item(self, domain, item):
|
||||
def process_item(self, spider, item):
|
||||
if item['price']:
|
||||
if item['price_excludes_vat']:
|
||||
item['price'] = item['price'] * self.vat_factor
|
||||
@ -68,11 +70,11 @@ To activate an Item Pipeline component you must add its class to the
|
||||
'myproject.pipeline.PricePipeline',
|
||||
]
|
||||
|
||||
Item pipeline example with resources per domain
|
||||
Item pipeline example with resources per spider
|
||||
===============================================
|
||||
|
||||
Sometimes you need to keep resources about the items processed grouped per
|
||||
domain, and delete those resource when a domain finish.
|
||||
spider, and delete those resources when a spider finishes.
|
||||
|
||||
An example is a filter that looks for duplicate items, and drops those items
|
||||
that were already processed. Let's say that our items have a unique id, but our
|
||||
@ -85,21 +87,21 @@ spider returns multiples items with the same id::
|
||||
|
||||
class DuplicatesPipeline(object):
|
||||
def __init__(self):
|
||||
self.domaininfo = {}
|
||||
dispatcher.connect(self.domain_opened, signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signals.domain_closed)
|
||||
self.duplicates = {}
|
||||
dispatcher.connect(self.spider_opened, signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
|
||||
def domain_opened(self, domain):
|
||||
self.duplicates[domain] = set()
|
||||
def spider_opened(self, spider):
|
||||
self.duplicates[spider] = set()
|
||||
|
||||
def domain_closed(self, domain):
|
||||
del self.duplicates[domain]
|
||||
def spider_closed(self, spider):
|
||||
del self.duplicates[spider]
|
||||
|
||||
def process_item(self, domain, item):
|
||||
if item.id in self.duplicates[domain]:
|
||||
def process_item(self, spider, item):
|
||||
if item.id in self.duplicates[spider]:
|
||||
raise DropItem("Duplicate item found: %s" % item)
|
||||
else:
|
||||
self.duplicates[domain].add(item.id)
|
||||
self.duplicates[spider].add(item.id)
|
||||
return item
|
||||
|
||||
Built-in Item Pipelines reference
|
||||
|
@ -45,7 +45,7 @@ which quite often consists in answer the question: *which spider is leaking?*.
|
||||
The leak could also come from a custom middleware, pipeline or extension that
|
||||
you have written, if you are not releasing the (previously allocated) resources
|
||||
properly. For example, if you're allocating resources on
|
||||
:signal:`domain_opened` but not releasing them on :signal:`domain_closed`.
|
||||
:signal:`spider_opened` but not releasing them on :signal:`spider_closed`.
|
||||
|
||||
.. _topics-leaks-trackrefs:
|
||||
|
||||
|
@ -44,7 +44,7 @@ Logging from Spiders
|
||||
|
||||
The recommended way to log from spiders is by using the Spider
|
||||
:meth:`~scrapy.spider.BaseSpider.log` method, which already populates the
|
||||
``domain`` argument of the :func:`scrapy.log.msg` function. The other arguments
|
||||
``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments
|
||||
are passed directly to the :func:`~scrapy.log.msg` function.
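For example, from inside a spider callback (a sketch; it assumes the ``level`` keyword is forwarded to :func:`scrapy.log.msg`)::

    from scrapy import log

    def parse(self, response):
        # self.log fills in the spider argument for you
        self.log("Crawled %s" % response.url, level=log.DEBUG)
        # ... extract items or follow links here ...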
|
||||
|
||||
scrapy.log module
|
||||
@ -81,7 +81,7 @@ scrapy.log module
|
||||
setting will be used.
|
||||
:type logstdout: boolean
|
||||
|
||||
.. function:: msg(message, level=INFO, component=BOT_NAME, domain=None)
|
||||
.. function:: msg(message, level=INFO, component=BOT_NAME, spider=None)
|
||||
|
||||
Log a message
|
||||
|
||||
@ -95,12 +95,12 @@ scrapy.log module
|
||||
:setting:`BOT_NAME`
|
||||
:type component: str
|
||||
|
||||
:param domain: the spider domain to use for logging this message. This
|
||||
parameter should always be used when logging things related to a
|
||||
particular spider.
|
||||
:type domain: str
|
||||
:param spider: the spider to use for logging this message. This parameter
|
||||
should always be used when logging things related to a particular
|
||||
spider.
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
.. function:: exc(message, level=ERROR, component=BOT_NAME, domain=None)
|
||||
.. function:: exc(message, level=ERROR, component=BOT_NAME, spider=None)
|
||||
|
||||
Log an exception. Similar to ``msg()`` but it also appends a stack trace
|
||||
report using `traceback.format_exc`.
|
||||
|
@ -250,13 +250,24 @@ objects.
|
||||
The :class:`FormRequest` objects support the following class method in
|
||||
addition to the standard :class:`Request` methods:
|
||||
|
||||
.. classmethod:: FormRequest.from_response(response, [formnumber=0, formdata, ...])
|
||||
.. classmethod:: FormRequest.from_response(response, [formnumber=0, formdata=None, clickdata=None, dont_click=False, ...])
|
||||
|
||||
Returns a new :class:`FormRequest` object with its form field values
|
||||
pre-populated with those found in the HTML ``<form>`` element contained
|
||||
in the given response. For an example see
|
||||
:ref:`topics-request-response-ref-request-userlogin`.
|
||||
|
||||
Keep in mind that this method is implemented using `ClientForm`_ whose
|
||||
policy is to automatically simulate a click, by default, on any form
|
||||
control that looks clickable, like an ``<input type="submit">``. Even
|
||||
though this is quite convenient, and often the desired behaviour,
|
||||
sometimes it can cause problems which could be hard to debug. For
|
||||
example, when working with forms that are filled and/or submitted using
|
||||
javascript, the default :meth:`from_response` (and `ClientForm`_)
|
||||
behaviour may not be the most appropriate. To disable this behaviour you
|
||||
can set the ``dont_click`` argument to ``True``. Also, if you want to
|
||||
change the control clicked (instead of disabling it) you can also use
|
||||
the ``clickdata`` argument.
|
||||
|
||||
:param response: the response containing a HTML form which will be used
|
||||
to pre-populate the form fields
|
||||
@ -271,10 +282,18 @@ objects.
|
||||
overridden by the one passed in this parameter.
|
||||
:type formdata: dict
|
||||
|
||||
:param clickdata: Arguments to be passed directly to ClientForm
|
||||
``click_request_data()`` method. See `ClientForm`_ homepage for
|
||||
more info.
|
||||
:type clickdata: dict
|
||||
|
||||
:param dont_click: If True, the form data will be submitted without
|
||||
clicking in any element.
|
||||
:type dont_click: boolean
|
||||
|
||||
The other parameters of this class method are passed directly to the
|
||||
:class:`FormRequest` constructor.
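For instance, a typical login callback might look like the following sketch (the field names are hypothetical)::

    from scrapy.http import FormRequest

    def parse_login_page(self, response):
        # Pre-populate the login form found in the response, overriding
        # the two hypothetical fields we care about.
        return FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login)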
|
||||
|
||||
|
||||
Request usage examples
|
||||
----------------------
|
||||
|
||||
|
@ -220,45 +220,6 @@ Default: ``1.0``
|
||||
The version of the bot implemented by this Scrapy project. This will be used to
|
||||
construct the User-Agent by default.
|
||||
|
||||
.. setting:: HTTPCACHE_DIR
|
||||
|
||||
HTTPCACHE_DIR
|
||||
-------------
|
||||
|
||||
Default: ``''`` (empty string)
|
||||
|
||||
The directory to use for storing the (low-level) HTTP cache. If empty the HTTP
|
||||
cache will be disabled.
|
||||
|
||||
.. setting:: HTTPCACHE_EXPIRATION_SECS
|
||||
|
||||
HTTPCACHE_EXPIRATION_SECS
|
||||
-------------------------
|
||||
|
||||
Default: ``0``
|
||||
|
||||
Number of seconds to use for HTTP cache expiration. Requests that were cached
|
||||
before this time will be re-downloaded. If zero, cached requests will always
|
||||
expire. Negative numbers means requests will never expire.
|
||||
|
||||
.. setting:: HTTPCACHE_IGNORE_MISSING
|
||||
|
||||
HTTPCACHE_IGNORE_MISSING
|
||||
------------------------
|
||||
|
||||
Default: ``False``
|
||||
|
||||
If enabled, requests not found in the cache will be ignored instead of downloaded.
|
||||
|
||||
.. setting:: HTTPCACHE_SECTORIZE
|
||||
|
||||
HTTPCACHE_SECTORIZE
|
||||
-------------------
|
||||
|
||||
Default: ``True``
|
||||
|
||||
Whether to split HTTP cache storage in several dirs for performance.
|
||||
|
||||
.. setting:: COMMANDS_MODULE
|
||||
|
||||
COMMANDS_MODULE
|
||||
@ -286,14 +247,14 @@ Example::
|
||||
|
||||
COMMANDS_SETTINGS_MODULE = 'mybot.conf.commands'
|
||||
|
||||
.. setting:: CONCURRENT_DOMAINS
|
||||
.. setting:: CONCURRENT_SPIDERS
|
||||
|
||||
CONCURRENT_DOMAINS
|
||||
CONCURRENT_SPIDERS
|
||||
------------------
|
||||
|
||||
Default: ``8``
|
||||
|
||||
Maximum number of domains to scrape in parallel.
|
||||
Maximum number of spiders to scrape in parallel.
|
||||
|
||||
.. setting:: CONCURRENT_ITEMS
|
||||
|
||||
@ -339,6 +300,17 @@ Default::
|
||||
The default headers used for Scrapy HTTP Requests. They're populated in the
|
||||
:class:`~scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware`.
|
||||
|
||||
.. setting:: DEFAULT_RESPONSE_ENCODING
|
||||
|
||||
DEFAULT_RESPONSE_ENCODING
|
||||
-------------------------
|
||||
|
||||
Default: ``'ascii'``
|
||||
|
||||
The default encoding to use for :class:`~scrapy.http.TextResponse` objects (and
|
||||
subclasses) when no encoding is declared and no encoding could be inferred from
|
||||
the body.
|
||||
|
||||
.. setting:: DEPTH_LIMIT
|
||||
|
||||
DEPTH_LIMIT
|
||||
@ -402,6 +374,7 @@ Default::
|
||||
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
|
||||
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
|
||||
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
|
||||
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
|
||||
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
|
||||
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
|
||||
@ -725,10 +698,37 @@ Default: ``+2``
|
||||
Adjust redirect request priority relative to original request.
|
||||
A negative priority adjust means more priority.
|
||||
|
||||
.. setting:: REQUESTS_PER_DOMAIN
|
||||
.. setting:: REQUEST_HANDLERS
|
||||
|
||||
REQUESTS_PER_DOMAIN
|
||||
-------------------
|
||||
REQUEST_HANDLERS
|
||||
----------------
|
||||
|
||||
Default: ``{}``
|
||||
|
||||
A dict containing the request download handlers enabled in your project.
|
||||
See `REQUEST_HANDLERS_BASE` for example format.
|
||||
|
||||
.. setting:: REQUEST_HANDLERS_BASE
|
||||
|
||||
REQUEST_HANDLERS_BASE
|
||||
---------------------
|
||||
|
||||
Default::
|
||||
|
||||
{
|
||||
'file': 'scrapy.core.downloader.handlers.file.download_file',
|
||||
'http': 'scrapy.core.downloader.handlers.http.download_http',
|
||||
'https': 'scrapy.core.downloader.handlers.http.download_http',
|
||||
}
|
||||
|
||||
A dict containing the request download handlers enabled by default in Scrapy.
|
||||
You should never modify this setting in your project, modify
|
||||
:setting:`REQUEST_HANDLERS` instead.
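For example, a project wanting to handle an extra URL scheme could register its own handler in :setting:`REQUEST_HANDLERS` like this (a sketch; the dotted path points to a callable you would provide yourself)::

    REQUEST_HANDLERS = {
        'ftp': 'myproject.handlers.download_ftp',   # hypothetical handler
    }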
|
||||
|
||||
.. setting:: CONCURRENT_REQUESTS_PER_SPIDER
|
||||
|
||||
CONCURRENT_REQUESTS_PER_SPIDER
|
||||
------------------------------
|
||||
|
||||
Default: ``8``
|
||||
|
||||
|
@ -29,68 +29,58 @@ Built-in signals reference
|
||||
Here's a list of signals used in Scrapy and their meaning, in alphabetical
|
||||
order.
|
||||
|
||||
domain_closed
|
||||
spider_closed
|
||||
-------------
|
||||
|
||||
.. signal:: domain_closed
|
||||
.. function:: domain_closed(domain, spider, reason)
|
||||
.. signal:: spider_closed
|
||||
.. function:: spider_closed(spider, reason)
|
||||
|
||||
Sent after a spider/domain has been closed. This can be used to release
|
||||
per-spider resources reserved on :signal:`domain_opened`.
|
||||
|
||||
:param domain: a string which contains the domain of the spider which has
|
||||
been closed
|
||||
:type domain: str
|
||||
Sent after a spider has been closed. This can be used to release per-spider
|
||||
resources reserved on :signal:`spider_opened`.
|
||||
|
||||
:param spider: the spider which has been closed
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
:param reason: a string which describes the reason why the domain was closed. If
|
||||
it was closed because the domain has completed scraping, it the reason
|
||||
is ``'finished'``. Otherwise, if the domain was manually closed by
|
||||
calling the ``close_domain`` engine method, then the reason is the one
|
||||
:param reason: a string which describes the reason why the spider was closed. If
|
||||
it was closed because the spider has completed scraping, the reason
|
||||
is ``'finished'``. Otherwise, if the spider was manually closed by
|
||||
calling the ``close_spider`` engine method, then the reason is the one
|
||||
passed in the ``reason`` argument of that method (which defaults to
|
||||
``'cancelled'``). If the engine was shutdown (for example, by hitting
|
||||
Ctrl-C to stop it) the reason will be ``'shutdown'``.
|
||||
:type reason: str
|
||||
|
||||
domain_opened
|
||||
spider_opened
|
||||
-------------
|
||||
|
||||
.. signal:: domain_opened
|
||||
.. function:: domain_opened(domain, spider)
|
||||
.. signal:: spider_opened
|
||||
.. function:: spider_opened(spider)
|
||||
|
||||
Sent after a spider/domain has been opened for crawling. This is typically
|
||||
used to reserve per-spider resources, but can be used for any task that
|
||||
needs to be performed when a spider/domain is opened.
|
||||
|
||||
:param domain: a string with the domain of the spider which has been opened
|
||||
:type domain: str
|
||||
Sent after a spider has been opened for crawling. This is typically used to
|
||||
reserve per-spider resources, but can be used for any task that needs to be
|
||||
performed when a spider is opened.
|
||||
|
||||
:param spider: the spider which has been opened
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
domain_idle
|
||||
spider_idle
|
||||
-----------
|
||||
|
||||
.. signal:: domain_idle
|
||||
.. function:: domain_idle(domain, spider)
|
||||
.. signal:: spider_idle
|
||||
.. function:: spider_idle(spider)
|
||||
|
||||
Sent when a domain has gone idle, which means the spider has no further:
|
||||
Sent when a spider has gone idle, which means the spider has no further:
|
||||
|
||||
* requests waiting to be downloaded
|
||||
* requests scheduled
|
||||
* items being processed in the item pipeline
|
||||
|
||||
If the idle state persists after all handlers of this signal have finished,
|
||||
the engine starts closing the domain. After the domain has finished
|
||||
closing, the :signal:`domain_closed` signal is sent.
|
||||
the engine starts closing the spider. After the spider has finished
|
||||
closing, the :signal:`spider_closed` signal is sent.
|
||||
|
||||
You can, for example, schedule some requests in your :signal:`domain_idle`
|
||||
handler to prevent the domain from being closed.
|
||||
|
||||
:param domain: is a string with the domain of the spider which has gone idle
|
||||
:type domain: str
|
||||
You can, for example, schedule some requests in your :signal:`spider_idle`
|
||||
handler to prevent the spider from being closed.
|
||||
|
||||
:param spider: the spider which has gone idle
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
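The documentation above suggests scheduling more requests; another way, used by the ``DelayedCloseDomain`` extension changed later in this commit, is to raise ``DontCloseDomain`` from the handler. A minimal sketch (the exception's import path and the helper are assumptions)::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.core.exceptions import DontCloseDomain   # assumed location

    def keep_alive(spider):
        # Raising DontCloseDomain from a spider_idle handler keeps the spider
        # open; returning normally lets the engine close it.
        if spider_has_pending_work(spider):   # hypothetical helper
            raise DontCloseDomain

    dispatcher.connect(keep_alive, signal=signals.spider_idle)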
|
||||
|
@ -210,12 +210,24 @@ OffsiteMiddleware
|
||||
|
||||
Filters out Requests for URLs outside the domains covered by the spider.
|
||||
|
||||
This middleware filters out every request whose host names doesn't match
|
||||
This middleware filters out every request whose host names don't match
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".
|
||||
Spiders can add more domains to exclude using the
|
||||
:attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute.
|
||||
|
||||
When your spider returns a request for a domain not belonging to those
|
||||
covered by the spider, this middleware will log a debug message similar to
|
||||
this one::
|
||||
|
||||
DEBUG: Filtered offsite request to 'www.othersite.com': <GET http://www.othersite.com/some/page.html>
|
||||
|
||||
To avoid filling the log with too much noise, it will only print one of
|
||||
these messages for each new domain filtered. So, for example, if another
|
||||
request for ``www.othersite.com`` is filtered, no log message will be
|
||||
printed. But if a request for ``someothersite.com`` is filtered, a message
|
||||
will be printed (but only for the first request filtered).
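For example, a spider that also needs to crawl a related domain can whitelist it through ``extra_domain_names`` (a sketch; the domains are hypothetical)::

    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        domain_name = 'example.com'
        extra_domain_names = ['shop.example.com']   # not filtered as offsite
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            pass   # requests to either domain pass the OffsiteMiddleware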
|
||||
|
||||
RefererMiddleware
|
||||
-----------------
|
||||
|
||||
|
@ -8,9 +8,9 @@ Overview
|
||||
========
|
||||
|
||||
Scrapy provides a convenient service for collecting stats in the form of
|
||||
key/values, both globally and per spider/domain. It's called the Stats
|
||||
Collector, and it's a singleton which can be imported and used quickly, as
|
||||
illustrated by the examples in the :ref:`topics-stats-usecases` section below.
|
||||
key/values, both globally and per spider. It's called the Stats Collector, and
|
||||
it's a singleton which can be imported and used quickly, as illustrated by the
|
||||
examples in the :ref:`topics-stats-usecases` section below.
|
||||
|
||||
The stats collection is enabled by default but can be disabled through the
|
||||
:setting:`STATS_ENABLED` setting.
|
||||
@ -26,10 +26,10 @@ using the Stats Collector from.
|
||||
Another feature of the Stats Collector is that it's very efficient (when
|
||||
enabled) and extremely efficient (almost unnoticeable) when disabled.
|
||||
|
||||
The Stats Collector keeps one stats table per open spider/domain and one global
|
||||
stats table. You can't set or get stats from a closed domain, but the
|
||||
domain-specific stats table is automatically opened when the spider is opened,
|
||||
and closed when the spider is closed.
|
||||
The Stats Collector keeps one stats table per open spider and one global stats
|
||||
table. You can't set or get stats from a closed spider, but the spider-specific
|
||||
stats table is automatically opened when the spider is opened, and closed when
|
||||
the spider is closed.
|
||||
|
||||
.. _topics-stats-usecases:
|
||||
|
||||
@ -61,36 +61,38 @@ Get global stat value::
|
||||
>>> stats.get_value('spiders_crawled')
|
||||
8
|
||||
|
||||
Get all global stats from a given domain::
|
||||
Get all global stats (ie. not particular to any spider)::
|
||||
|
||||
>>> stats.get_stats()
|
||||
{'hostname': 'localhost', 'spiders_crawled': 8}
|
||||
|
||||
Set domain/spider specific stat value (domains must be opened first, but this
|
||||
Set spider specific stat value (spider stats must be opened first, but this
|
||||
task is handled automatically by the Scrapy engine)::
|
||||
|
||||
stats.set_value('start_time', datetime.now(), domain='example.com')
|
||||
stats.set_value('start_time', datetime.now(), spider=some_spider)
|
||||
|
||||
Increment domain-specific stat value::
|
||||
Where ``some_spider`` is a :class:`~scrapy.spider.BaseSpider` object.
|
||||
|
||||
stats.inc_value('pages_crawled', domain='example.com')
|
||||
Increment spider-specific stat value::
|
||||
|
||||
Set domain-specific stat value only if greater than previous::
|
||||
stats.inc_value('pages_crawled', spider=some_spider)
|
||||
|
||||
stats.max_value('max_items_scraped', value, domain='example.com')
|
||||
Set spider-specific stat value only if greater than previous::
|
||||
|
||||
Set domain-specific stat value only if lower than previous::
|
||||
stats.max_value('max_items_scraped', value, spider=some_spider)
|
||||
|
||||
stats.min_value('min_free_memory_percent', value, domain='example.com')
|
||||
Set spider-specific stat value only if lower than previous::
|
||||
|
||||
Get domain-specific stat value::
|
||||
stats.min_value('min_free_memory_percent', value, spider=some_spider)
|
||||
|
||||
>>> stats.get_value('pages_crawled', domain='example.com')
|
||||
Get spider-specific stat value::
|
||||
|
||||
>>> stats.get_value('pages_crawled', spider=some_spider)
|
||||
1238
|
||||
|
||||
Get all stats from a given domain::
|
||||
Get all stats from a given spider::
|
||||
|
||||
>>> stats.get_stats('pages_crawled', domain='example.com')
|
||||
>>> stats.get_stats(spider=some_spider)
|
||||
{'pages_crawled': 1238, 'start_time': datetime.datetime(2009, 7, 14, 21, 47, 28, 977139)}
|
||||
|
||||
.. _topics-stats-ref:
|
||||
@ -108,75 +110,80 @@ class (which they all inherit from).
|
||||
|
||||
.. class:: StatsCollector
|
||||
|
||||
.. method:: get_value(key, default=None, domain=None)
|
||||
.. method:: get_value(key, default=None, spider=None)
|
||||
|
||||
Return the value for the given stats key or default if it doesn't exist.
|
||||
If domain is ``None`` the global stats table is consulted, other the
|
||||
domain specific one is. If the domain is not yet opened a ``KeyError``
|
||||
If spider is ``None`` the global stats table is consulted, otherwise the
|
||||
spider specific one is. If the spider is not yet opened a ``KeyError``
|
||||
exception is raised.
|
||||
|
||||
.. method:: get_stats(domain=None)
|
||||
.. method:: get_stats(spider=None)
|
||||
|
||||
Get all stats from the given domain/spider (if domain is given) or all
|
||||
global stats otherwise, as a dict. If domain is not opened ``KeyError``
|
||||
is raied.
|
||||
Get all stats from the given spider (if spider is given) or all global
|
||||
stats otherwise, as a dict. If spider is not opened ``KeyError`` is
|
||||
raised.
|
||||
|
||||
.. method:: set_value(key, value, domain=None)
|
||||
.. method:: set_value(key, value, spider=None)
|
||||
|
||||
Set the given value for the given stats key on the global stats (if
|
||||
domain is not given) or the domain-specific stats (if domain is given),
|
||||
spider is not given) or the spider-specific stats (if spider is given),
|
||||
which must be opened or a ``KeyError`` will be raised.
|
||||
|
||||
.. method:: set_stats(stats, domain=None)
|
||||
.. method:: set_stats(stats, spider=None)
|
||||
|
||||
Set the given stats (as a dict) for the given domain. If the domain is
|
||||
Set the given stats (as a dict) for the given spider. If the spider is
|
||||
not opened a ``KeyError`` will be raised.
|
||||
|
||||
.. method:: inc_value(key, count=1, start=0, domain=None)
|
||||
.. method:: inc_value(key, count=1, start=0, spider=None)
|
||||
|
||||
Increment the value of the given stats key, by the given count,
|
||||
assuming the start value given (when it's not set). If domain is not
|
||||
given the global stats table is used, otherwise the domain-specific
|
||||
assuming the start value given (when it's not set). If spider is not
|
||||
given the global stats table is used, otherwise the spider-specific
|
||||
stats table is used, which must be opened or a ``KeyError`` will be
|
||||
raised.
|
||||
|
||||
.. method:: max_value(key, value, domain=None)
|
||||
.. method:: max_value(key, value, spider=None)
|
||||
|
||||
Set the given value for the given key only if current value for the
|
||||
same key is lower than value. If there is no current value for the
|
||||
given key, the value is always set. If domain is not given the global
|
||||
stats table is used, otherwise the domain-specific stats table is used,
|
||||
given key, the value is always set. If spider is not given the global
|
||||
stats table is used, otherwise the spider-specific stats table is used,
|
||||
which must be opened or a KeyError will be raised.
|
||||
|
||||
.. method:: min_value(key, value, domain=None)
|
||||
.. method:: min_value(key, value, spider=None)
|
||||
|
||||
Set the given value for the given key only if current value for the
|
||||
same key is greater than value. If there is no current value for the
|
||||
given key, the value is always set. If domain is not given the global
|
||||
stats table is used, otherwise the domain-specific stats table is used,
|
||||
given key, the value is always set. If spider is not given the global
|
||||
stats table is used, otherwise the spider-specific stats table is used,
|
||||
which must be opened or a KeyError will be raised.
|
||||
|
||||
.. method:: clear_stats(domain=None)
|
||||
.. method:: clear_stats(spider=None)
|
||||
|
||||
Clear all global stats (if domain is not given) or all domain-specific
|
||||
stats if domain is given, in which case it must be opened or a
|
||||
Clear all global stats (if spider is not given) or all spider-specific
|
||||
stats if spider is given, in which case it must be opened or a
|
||||
``KeyError`` will be raised.
|
||||
|
||||
.. method:: list_domains()
|
||||
.. method:: iter_spider_stats()
|
||||
|
||||
Return a list of all opened domains.
|
||||
Return a iterator over ``(spider, spider_stats)`` for each open spider
|
||||
currently tracked by the stats collector, where ``spider_stats`` is the
|
||||
dict containing all spider-specific stats.
|
||||
|
||||
.. method:: open_domain(domain)
|
||||
Global stats are not included in the iterator. If you want to get
|
||||
those, use :meth:`get_stats` method.
|
||||
|
||||
Open the given domain for stats collection. This method must be called
|
||||
prior to working with any stats specific to that domain, but this task
|
||||
.. method:: open_spider(spider)
|
||||
|
||||
Open the given spider for stats collection. This method must be called
|
||||
prior to working with any stats specific to that spider, but this task
|
||||
is handled automatically by the Scrapy engine.
|
||||
|
||||
.. method:: close_domain(domain)
|
||||
.. method:: close_spider(spider)
|
||||
|
||||
Close the given domain. After this is called, no more specific stats
|
||||
for this domain can be accessed. This method is called automatically on
|
||||
the :signal:`domain_closed` signal.
|
||||
Close the given spider. After this is called, no more specific stats
|
||||
for this spider can be accessed. This method is called automatically on
|
||||
the :signal:`spider_closed` signal.
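As a usage sketch, an extension could dump per-spider counters with :meth:`iter_spider_stats` like this::

    from scrapy.stats import stats

    for spider, spider_stats in stats.iter_spider_stats():
        # spider_stats is the dict of spider-specific stats described above
        print spider.domain_name, spider_stats.get('item_scraped_count', 0)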
|
||||
|
||||
Available Stats Collectors
|
||||
==========================
|
||||
@ -196,15 +203,16 @@ MemoryStatsCollector
|
||||
.. class:: MemoryStatsCollector
|
||||
|
||||
A simple stats collector that keeps the stats of the last scraping run (for
|
||||
each domain) in memory, which can be accessed through the ``domain_stats``
|
||||
attribute
|
||||
each spider) in memory, after they're closed. The stats can be accessed
|
||||
through the :attr:`domain_stats` attribute, which is a dict keyed by spider
|
||||
domain name.
|
||||
|
||||
This is the default Stats Collector used in Scrapy.
|
||||
|
||||
.. attribute:: domain_stats
|
||||
|
||||
A dict of dicts (keyed by domain) containing the stats of the last
|
||||
scraping run for each domain.
|
||||
A dict of dicts (keyed by spider domain name) containing the stats of
|
||||
the last scraping run for each spider.
|
||||
|
||||
DummyStatsCollector
|
||||
-------------------
|
||||
@ -283,41 +291,41 @@ functionality:
|
||||
.. module:: scrapy.stats.signals
|
||||
:synopsis: Stats Collector signals
|
||||
|
||||
.. signal:: stats_domain_opened
|
||||
.. function:: stats_domain_opened(domain)
|
||||
.. signal:: stats_spider_opened
|
||||
.. function:: stats_spider_opened(spider)
|
||||
|
||||
Sent right after the stats domain is opened. You can use this signal to add
|
||||
startup stats for domain (example: start time).
|
||||
Sent right after the stats spider is opened. You can use this signal to add
|
||||
startup stats for the spider (example: start time).
|
||||
|
||||
:param domain: the stats domain just opened
|
||||
:type domain: str
|
||||
:param spider: the stats spider just opened
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
.. signal:: stats_domain_closing
|
||||
.. function:: stats_domain_closing(domain, reason)
|
||||
.. signal:: stats_spider_closing
|
||||
.. function:: stats_spider_closing(spider, reason)
|
||||
|
||||
Sent just before the stats domain is closed. You can use this signal to add
|
||||
Sent just before the stats spider is closed. You can use this signal to add
|
||||
some closing stats (example: finish time).
|
||||
|
||||
:param domain: the stats domain about to be closed
|
||||
:type domain: str
|
||||
:param spider: the stats spider about to be closed
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
:param reason: the reason why the domain is being closed. See
|
||||
:signal:`domain_closed` signal for more info.
|
||||
:param reason: the reason why the spider is being closed. See
|
||||
:signal:`spider_closed` signal for more info.
|
||||
:type reason: str
|
||||
|
||||
.. signal:: stats_domain_closed
|
||||
.. function:: stats_domain_closed(domain, reason, domain_stats)
|
||||
.. signal:: stats_spider_closed
|
||||
.. function:: stats_spider_closed(spider, reason, spider_stats)
|
||||
|
||||
Sent right after the stats domain is closed. You can use this signal to
|
||||
collect resources, but not to add any more stats as the stats domain has
|
||||
already been close (use :signal:`stats_domain_closing` for that instead).
|
||||
Sent right after the stats spider is closed. You can use this signal to
|
||||
collect resources, but not to add any more stats as the stats spider has
|
||||
already been closed (use :signal:`stats_spider_closing` for that instead).
|
||||
|
||||
:param domain: the stats domain just closed
|
||||
:type domain: str
|
||||
:param spider: the stats spider just closed
|
||||
:type spider: :class:`~scrapy.spider.BaseSpider` object
|
||||
|
||||
:param reason: the reason why the domain was closed. See
|
||||
:signal:`domain_closed` signal for more info.
|
||||
:param reason: the reason why the spider was closed. See
|
||||
:signal:`spider_closed` signal for more info.
|
||||
:type reason: str
|
||||
|
||||
:param domain_stats: the stats of the domain just closed.
|
||||
:param spider_stats: the stats of the spider just closed.
|
||||
:type spider_stats: dict
|
||||
|
@ -7,7 +7,7 @@ BOT_VERSION = '1.0'
|
||||
|
||||
SPIDER_MODULES = ['googledir.spiders']
|
||||
NEWSPIDER_MODULE = 'googledir.spiders'
|
||||
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
|
||||
DEFAULT_ITEM_CLASS = 'scrapy.item.Item'
|
||||
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
|
||||
|
||||
ITEM_PIPELINES = ['googledir.pipelines.FilterWordsPipeline']
|
||||
|
@ -6,8 +6,8 @@
|
||||
hg purge --all
|
||||
|
||||
# build packages
|
||||
version=$(python -c "import scrapy; print scrapy.__version__")
|
||||
python setup.py sdist
|
||||
#version=$(python -c "import scrapy; print scrapy.__version__")
|
||||
#python setup.py sdist
|
||||
# FIXME: bdist_wininst doesn't work on Unix (it doesn't include the data_files)
|
||||
# To build the win32 release you need to use Windows for now.
|
||||
#python setup.py bdist_wininst -t "Scrapy $version" -p "win32"
|
||||
|
@ -3,8 +3,8 @@ Scrapy - a screen scraping framework written in Python
|
||||
"""
|
||||
|
||||
# IMPORTANT: remember to also update the version in docs/conf.py
|
||||
version_info = (0, 7, 0, 'final', 0)
|
||||
__version__ = "0.7"
|
||||
version_info = (0, 8, 0, '', 0)
|
||||
__version__ = "0.8-dev"
|
||||
|
||||
import sys, os
|
||||
|
||||
|
@ -6,29 +6,26 @@ See documentation in docs/topics/settings.rst
|
||||
|
||||
import os
|
||||
import cPickle as pickle
|
||||
import warnings
|
||||
|
||||
from scrapy.conf import default_settings
|
||||
|
||||
if 'SCRAPYSETTINGS_MODULE' in os.environ:
|
||||
warnings.warn("SCRAPYSETTINGS_MODULE environment variable is deprecated, " \
|
||||
"use SCRAPY_SETTINGS_MODULE instead", DeprecationWarning, stacklevel=2)
|
||||
|
||||
import_ = lambda x: __import__(x, {}, {}, [''])
|
||||
|
||||
class Settings(object):
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, overrides=None):
|
||||
self.defaults = {}
|
||||
self.global_defaults = default_settings
|
||||
self.disabled = os.environ.get('SCRAPY_SETTINGS_DISABLED', False)
|
||||
settings_module_path = os.environ.get('SCRAPY_SETTINGS_MODULE', \
|
||||
os.environ.get('SCRAPYSETTINGS_MODULE', 'scrapy_settings'))
|
||||
'scrapy_settings')
|
||||
self.set_settings_module(settings_module_path)
|
||||
|
||||
# XXX: find a better solution for this hack
|
||||
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
|
||||
self.overrides = pickle.loads(pickled_settings) if pickled_settings else {}
|
||||
if overrides:
|
||||
self.overrides.update(overrides)
|
||||
|
||||
def __getitem__(self, opt_name):
|
||||
if not self.disabled:
|
||||
|
@ -18,15 +18,15 @@ from os.path import join, abspath, dirname
|
||||
BOT_NAME = 'scrapybot'
|
||||
BOT_VERSION = '1.0'
|
||||
|
||||
CLOSEDOMAIN_TIMEOUT = 0
|
||||
CLOSEDOMAIN_ITEMPASSED = 0
|
||||
CLOSESPIDER_TIMEOUT = 0
|
||||
CLOSESPIDER_ITEMPASSED = 0
|
||||
|
||||
COMMANDS_MODULE = ''
|
||||
COMMANDS_SETTINGS_MODULE = ''
|
||||
|
||||
CONCURRENT_DOMAINS = 8
|
||||
|
||||
CONCURRENT_ITEMS = 100
|
||||
CONCURRENT_REQUESTS_PER_SPIDER = 8
|
||||
CONCURRENT_SPIDERS = 8
|
||||
|
||||
COOKIES_DEBUG = False
|
||||
|
||||
@ -37,6 +37,8 @@ DEFAULT_REQUEST_HEADERS = {
|
||||
'Accept-Language': 'en',
|
||||
}
|
||||
|
||||
DEFAULT_RESPONSE_ENCODING = 'ascii'
|
||||
|
||||
DEPTH_LIMIT = 0
|
||||
DEPTH_STATS = True
|
||||
|
||||
@ -60,6 +62,7 @@ DOWNLOADER_MIDDLEWARES_BASE = {
|
||||
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
|
||||
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
|
||||
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
|
||||
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
|
||||
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
|
||||
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
|
||||
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
|
||||
@ -83,7 +86,7 @@ EXTENSIONS_BASE = {
|
||||
'scrapy.contrib.webconsole.stats.StatsDump': 0,
|
||||
'scrapy.contrib.memusage.MemoryUsage': 0,
|
||||
'scrapy.contrib.memdebug.MemoryDebugger': 0,
|
||||
'scrapy.contrib.closedomain.CloseDomain': 0,
|
||||
'scrapy.contrib.closespider.CloseSpider': 0,
|
||||
}
|
||||
|
||||
GROUPSETTINGS_ENABLED = False
|
||||
@ -91,7 +94,7 @@ GROUPSETTINGS_MODULE = ''
|
||||
|
||||
HTTPCACHE_DIR = ''
|
||||
HTTPCACHE_IGNORE_MISSING = False
|
||||
HTTPCACHE_SECTORIZE = True
|
||||
HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'
|
||||
HTTPCACHE_EXPIRATION_SECS = 0
|
||||
|
||||
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
||||
@ -100,6 +103,7 @@ ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
||||
ITEM_PIPELINES = []
|
||||
|
||||
LOG_ENABLED = True
|
||||
LOG_FORMATTER_CRAWLED = 'scrapy.contrib.logformatter.crawled_logline'
|
||||
LOG_STDOUT = False
|
||||
LOG_LEVEL = 'DEBUG'
|
||||
LOG_FILE = None
|
||||
@ -124,8 +128,14 @@ REDIRECT_MAX_METAREFRESH_DELAY = 100
|
||||
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
|
||||
REDIRECT_PRIORITY_ADJUST = +2
|
||||
|
||||
REQUEST_HANDLERS = {}
|
||||
REQUEST_HANDLERS_BASE = {
|
||||
'file': 'scrapy.core.downloader.handlers.file.download_file',
|
||||
'http': 'scrapy.core.downloader.handlers.http.download_http',
|
||||
'https': 'scrapy.core.downloader.handlers.http.download_http',
|
||||
}
|
||||
|
||||
REQUESTS_QUEUE_SIZE = 0
|
||||
REQUESTS_PER_DOMAIN = 8 # max simultaneous requests per domain
|
||||
|
||||
# contrib.middleware.retry.RetryMiddleware default settings
|
||||
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
|
||||
|
@ -1,4 +1,4 @@
|
||||
"""CloseDomain is an extension that forces spiders to be closed after certain
|
||||
"""CloseSpider is an extension that forces spiders to be closed after certain
|
||||
conditions are met.
|
||||
|
||||
See documentation in docs/topics/extensions.rst
|
||||
@ -13,31 +13,31 @@ from scrapy.core import signals
|
||||
from scrapy.core.engine import scrapyengine
|
||||
from scrapy.conf import settings
|
||||
|
||||
class CloseDomain(object):
|
||||
class CloseSpider(object):
|
||||
|
||||
def __init__(self):
|
||||
self.timeout = settings.getint('CLOSEDOMAIN_TIMEOUT')
|
||||
self.itempassed = settings.getint('CLOSEDOMAIN_ITEMPASSED')
|
||||
self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
|
||||
self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
|
||||
|
||||
self.counts = defaultdict(int)
|
||||
self.tasks = {}
|
||||
|
||||
if self.timeout:
|
||||
dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
if self.itempassed:
|
||||
dispatcher.connect(self.item_passed, signal=signals.item_passed)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def domain_opened(self, spider):
|
||||
def spider_opened(self, spider):
|
||||
self.tasks[spider] = reactor.callLater(self.timeout, scrapyengine.close_spider, \
|
||||
spider=spider, reason='closedomain_timeout')
|
||||
spider=spider, reason='closespider_timeout')
|
||||
|
||||
def item_passed(self, item, spider):
|
||||
self.counts[spider] += 1
|
||||
if self.counts[spider] == self.itempassed:
|
||||
scrapyengine.close_spider(spider, 'closedomain_itempassed')
|
||||
scrapyengine.close_spider(spider, 'closespider_itempassed')
|
||||
|
||||
def domain_closed(self, spider):
|
||||
def spider_closed(self, spider):
|
||||
self.counts.pop(spider, None)
|
||||
tsk = self.tasks.pop(spider, None)
|
||||
if tsk and not tsk.called:
|
@ -10,7 +10,7 @@ from scrapy.xlib.pydispatch import dispatcher
|
||||
|
||||
from scrapy.core import signals
|
||||
from scrapy.stats import stats
|
||||
from scrapy.stats.signals import stats_domain_opened, stats_domain_closing
|
||||
from scrapy.stats.signals import stats_spider_opened, stats_spider_closing
|
||||
from scrapy.conf import settings
|
||||
|
||||
class CoreStats(object):
|
||||
@@ -22,32 +22,32 @@ class CoreStats(object):
         stats.set_value('envinfo/logfile', settings['LOG_FILE'])
         stats.set_value('envinfo/pid', os.getpid())

-        dispatcher.connect(self.stats_domain_opened, signal=stats_domain_opened)
-        dispatcher.connect(self.stats_domain_closing, signal=stats_domain_closing)
+        dispatcher.connect(self.stats_spider_opened, signal=stats_spider_opened)
+        dispatcher.connect(self.stats_spider_closing, signal=stats_spider_closing)
         dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
         dispatcher.connect(self.item_passed, signal=signals.item_passed)
         dispatcher.connect(self.item_dropped, signal=signals.item_dropped)

-    def stats_domain_opened(self, domain):
-        stats.set_value('start_time', datetime.datetime.utcnow(), domain=domain)
-        stats.set_value('envinfo/host', stats.get_value('envinfo/host'), domain=domain)
-        stats.inc_value('domain_count/opened')
+    def stats_spider_opened(self, spider):
+        stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
+        stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
+        stats.inc_value('spider_count/opened')

-    def stats_domain_closing(self, domain, reason):
-        stats.set_value('finish_time', datetime.datetime.utcnow(), domain=domain)
-        stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, domain=domain)
-        stats.inc_value('domain_count/%s' % reason, domain=domain)
+    def stats_spider_closing(self, spider, reason):
+        stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
+        stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
+        stats.inc_value('spider_count/%s' % reason, spider=spider)

     def item_scraped(self, item, spider):
-        stats.inc_value('item_scraped_count', domain=spider.domain_name)
+        stats.inc_value('item_scraped_count', spider=spider)
         stats.inc_value('item_scraped_count')

     def item_passed(self, item, spider):
-        stats.inc_value('item_passed_count', domain=spider.domain_name)
+        stats.inc_value('item_passed_count', spider=spider)
         stats.inc_value('item_passed_count')

     def item_dropped(self, item, spider, exception):
         reason = exception.__class__.__name__
-        stats.inc_value('item_dropped_count', domain=spider.domain_name)
-        stats.inc_value('item_dropped_reasons_count/%s' % reason, domain=spider.domain_name)
+        stats.inc_value('item_dropped_count', spider=spider)
+        stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
         stats.inc_value('item_dropped_count')
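The hunk above switches the core stats extension from per-domain to per-spider signals and stat keys. A minimal custom extension wired to the renamed signals might look like the sketch below; only signal and stats calls that appear in this changeset are used, and the stats key is made up:

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.stats import stats
    from scrapy.stats.signals import stats_spider_opened

    class SpiderOpenCounter(object):
        """Hypothetical extension: counts how many spiders were opened."""

        def __init__(self):
            dispatcher.connect(self.stats_spider_opened, signal=stats_spider_opened)

        def stats_spider_opened(self, spider):
            stats.inc_value('custom/spiders_opened')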
@@ -21,19 +21,19 @@ class DelayedCloseDomain(object):
             raise NotConfigured

         self.opened_at = defaultdict(time)
-        dispatcher.connect(self.domain_idle, signal=signals.domain_idle)
-        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
+        dispatcher.connect(self.spider_idle, signal=signals.spider_idle)
+        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

-    def domain_idle(self, domain):
+    def spider_idle(self, spider):
         try:
-            lastseen = scrapyengine.downloader.sites[domain].lastseen
+            lastseen = scrapyengine.downloader.sites[spider].lastseen
         except KeyError:
             lastseen = None
         if not lastseen:
-            lastseen = self.opened_at[domain]
+            lastseen = self.opened_at[spider]

         if time() < lastseen + self.delay:
             raise DontCloseDomain

-    def domain_closed(self, domain):
-        self.opened_at.pop(domain, None)
+    def spider_closed(self, spider):
+        self.opened_at.pop(spider, None)
@@ -16,13 +16,13 @@ class CookiesMiddleware(object):

     def __init__(self):
         self.jars = defaultdict(CookieJar)
-        dispatcher.connect(self.domain_closed, signals.domain_closed)
+        dispatcher.connect(self.spider_closed, signals.spider_closed)

     def process_request(self, request, spider):
         if request.meta.get('dont_merge_cookies', False):
             return

-        jar = self.jars[spider.domain_name]
+        jar = self.jars[spider]
         cookies = self._get_request_cookies(jar, request)
         for cookie in cookies:
             jar.set_cookie_if_ok(cookie, request)
@@ -37,14 +37,14 @@ class CookiesMiddleware(object):
             return response

         # extract cookies from Set-Cookie and drop invalid/expired cookies
-        jar = self.jars[spider.domain_name]
+        jar = self.jars[spider]
         jar.extract_cookies(response, request)
         self._debug_set_cookie(response)

         return response

-    def domain_closed(self, domain):
-        self.jars.pop(domain, None)
+    def spider_closed(self, spider):
+        self.jars.pop(spider, None)

     def _debug_cookie(self, request):
         """log Cookie header for request"""
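With the change above, cookies are kept in one cookie jar per spider object, and a request can still opt out of cookie handling through its meta dict. A small illustration (the URL is made up):

    from scrapy.http import Request

    # this request will not go through the shared per-spider cookie jar
    req = Request('http://www.example.com/login',
                  meta={'dont_merge_cookies': True})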
@ -1,180 +1,122 @@
|
||||
from __future__ import with_statement
|
||||
|
||||
import errno
|
||||
import os
|
||||
import hashlib
|
||||
import datetime
|
||||
from os.path import join, exists
|
||||
from time import time
|
||||
import cPickle as pickle
|
||||
from scrapy.xlib.pydispatch import dispatcher
|
||||
|
||||
from scrapy.xlib.pydispatch import dispatcher
|
||||
from scrapy.core import signals
|
||||
from scrapy import log
|
||||
from scrapy.http import Headers
|
||||
from scrapy.core.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.core.downloader.responsetypes import responsetypes
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.conf import settings
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy import conf
|
||||
|
||||
|
||||
class HttpCacheMiddleware(object):
|
||||
def __init__(self):
|
||||
if not settings['HTTPCACHE_DIR']:
|
||||
raise NotConfigured
|
||||
self.cache = Cache(settings['HTTPCACHE_DIR'], sectorize=settings.getbool('HTTPCACHE_SECTORIZE'))
|
||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||
dispatcher.connect(self.open_domain, signal=signals.domain_opened)
|
||||
|
||||
def open_domain(self, domain):
|
||||
self.cache.open_domain(domain)
|
||||
def __init__(self, settings=conf.settings):
|
||||
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.storage.open_spider(spider)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.storage.close_spider(spider)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if not is_cacheable(request):
|
||||
if not self.is_cacheable(request):
|
||||
return
|
||||
|
||||
key = request_fingerprint(request)
|
||||
domain = spider.domain_name
|
||||
|
||||
try:
|
||||
response = self.cache.retrieve_response(domain, key)
|
||||
except:
|
||||
log.msg("Corrupt cache for %s" % request.url, log.WARNING)
|
||||
response = False
|
||||
|
||||
response = self.storage.retrieve_response(spider, request)
|
||||
if response:
|
||||
response.flags.append('cached')
|
||||
return response
|
||||
elif self.ignore_missing:
|
||||
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if is_cacheable(request):
|
||||
key = request_fingerprint(request)
|
||||
self.cache.store(spider.domain_name, key, request, response)
|
||||
|
||||
if self.is_cacheable(request):
|
||||
self.storage.store_response(spider, request, response)
|
||||
return response
|
||||
|
||||
|
||||
def is_cacheable(request):
|
||||
def is_cacheable(self, request):
|
||||
return urlparse_cached(request).scheme in ['http', 'https']
|
||||
|
||||
|
||||
class Cache(object):
|
||||
DOMAIN_SECTORDIR = 'data'
|
||||
DOMAIN_LINKDIR = 'domains'
|
||||
class FilesystemCacheStorage(object):
|
||||
|
||||
def __init__(self, cachedir, sectorize=False):
|
||||
def __init__(self, settings=conf.settings):
|
||||
cachedir = settings['HTTPCACHE_DIR']
|
||||
if not cachedir:
|
||||
raise NotConfigured
|
||||
self.cachedir = cachedir
|
||||
self.sectorize = sectorize
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
|
||||
self.baselinkpath = os.path.join(self.cachedir, self.DOMAIN_LINKDIR)
|
||||
if not os.path.exists(self.baselinkpath):
|
||||
os.makedirs(self.baselinkpath)
|
||||
def open_spider(self, spider):
|
||||
pass
|
||||
|
||||
self.basesectorpath = os.path.join(self.cachedir, self.DOMAIN_SECTORDIR)
|
||||
if not os.path.exists(self.basesectorpath):
|
||||
os.makedirs(self.basesectorpath)
|
||||
def close_spider(self, spider):
|
||||
pass
|
||||
|
||||
def domainsectorpath(self, domain):
|
||||
sector = hashlib.sha1(domain).hexdigest()[0]
|
||||
return os.path.join(self.basesectorpath, sector, domain)
|
||||
|
||||
def domainlinkpath(self, domain):
|
||||
return os.path.join(self.baselinkpath, domain)
|
||||
|
||||
def requestpath(self, domain, key):
|
||||
linkpath = self.domainlinkpath(domain)
|
||||
return os.path.join(linkpath, key[0:2], key)
|
||||
|
||||
def open_domain(self, domain):
|
||||
if domain:
|
||||
linkpath = self.domainlinkpath(domain)
|
||||
if self.sectorize:
|
||||
sectorpath = self.domainsectorpath(domain)
|
||||
if not os.path.exists(sectorpath):
|
||||
os.makedirs(sectorpath)
|
||||
if not os.path.exists(linkpath):
|
||||
try:
|
||||
os.symlink(sectorpath, linkpath)
|
||||
except:
|
||||
os.makedirs(linkpath) # windows filesystem
|
||||
else:
|
||||
if not os.path.exists(linkpath):
|
||||
os.makedirs(linkpath)
|
||||
|
||||
def read_meta(self, domain, key):
|
||||
"""Return the metadata dictionary (possibly empty) if the entry is
|
||||
cached, None otherwise.
|
||||
"""
|
||||
requestpath = self.requestpath(domain, key)
|
||||
try:
|
||||
with open(os.path.join(requestpath, 'pickled_meta'), 'r') as f:
|
||||
metadata = pickle.load(f)
|
||||
except IOError, e:
|
||||
if e.errno != errno.ENOENT:
|
||||
raise
|
||||
return None
|
||||
expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
if expiration_secs >= 0:
|
||||
expiration_date = metadata['timestamp'] + datetime.timedelta(seconds=expiration_secs)
|
||||
if datetime.datetime.utcnow() > expiration_date:
|
||||
log.msg('dropping old cached response from %s' % metadata['timestamp'], level=log.DEBUG, domain=domain)
|
||||
return None
|
||||
return metadata
|
||||
|
||||
def retrieve_response(self, domain, key):
|
||||
"""
|
||||
Return response dictionary if request has correspondent cache record;
|
||||
return None if not.
|
||||
"""
|
||||
metadata = self.read_meta(domain, key)
|
||||
def retrieve_response(self, spider, request):
|
||||
"""Return response if present in cache, or None otherwise."""
|
||||
metadata = self._read_meta(spider, request)
|
||||
if metadata is None:
|
||||
return None # not cached
|
||||
|
||||
requestpath = self.requestpath(domain, key)
|
||||
responsebody = responseheaders = None
|
||||
with open(os.path.join(requestpath, 'response_body')) as f:
|
||||
responsebody = f.read()
|
||||
with open(os.path.join(requestpath, 'response_headers')) as f:
|
||||
responseheaders = f.read()
|
||||
|
||||
return # not cached
|
||||
rpath = self._get_request_path(spider, request)
|
||||
with open(join(rpath, 'response_body'), 'rb') as f:
|
||||
body = f.read()
|
||||
with open(join(rpath, 'response_headers'), 'rb') as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata['url']
|
||||
headers = Headers(headers_raw_to_dict(responseheaders))
|
||||
status = metadata['status']
|
||||
|
||||
headers = Headers(headers_raw_to_dict(rawheaders))
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=responsebody)
|
||||
response.meta['cached'] = True
|
||||
response.flags.append('cached')
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store(self, domain, key, request, response):
|
||||
requestpath = self.requestpath(domain, key)
|
||||
if not os.path.exists(requestpath):
|
||||
os.makedirs(requestpath)
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
"""Store the given response in the cache."""
|
||||
rpath = self._get_request_path(spider, request)
|
||||
if not exists(rpath):
|
||||
os.makedirs(rpath)
|
||||
metadata = {
|
||||
'url': request.url,
|
||||
'method': request.method,
|
||||
'status': response.status,
|
||||
'domain': domain,
|
||||
'timestamp': datetime.datetime.utcnow(),
|
||||
'timestamp': time(),
|
||||
}
|
||||
|
||||
# metadata
|
||||
with open(os.path.join(requestpath, 'meta_data'), 'w') as f:
|
||||
with open(join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(repr(metadata))
|
||||
# pickled metadata (to recover without using eval)
|
||||
with open(os.path.join(requestpath, 'pickled_meta'), 'w') as f:
|
||||
pickle.dump(metadata, f)
|
||||
# response
|
||||
with open(os.path.join(requestpath, 'response_headers'), 'w') as f:
|
||||
with open(join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=2)
|
||||
with open(join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with open(os.path.join(requestpath, 'response_body'), 'w') as f:
|
||||
with open(join(rpath, 'response_body'), 'wb') as f:
|
||||
f.write(response.body)
|
||||
# request
|
||||
with open(os.path.join(requestpath, 'request_headers'), 'w') as f:
|
||||
with open(join(rpath, 'request_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
if request.body:
|
||||
with open(os.path.join(requestpath, 'request_body'), 'w') as f:
|
||||
with open(join(rpath, 'request_body'), 'wb') as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
key = request_fingerprint(request)
|
||||
return join(self.cachedir, spider.domain_name, key[0:2], key)
|
||||
|
||||
def _read_meta(self, spider, request):
|
||||
rpath = self._get_request_path(spider, request)
|
||||
metapath = join(rpath, 'pickled_meta')
|
||||
if not exists(metapath):
|
||||
return # not found
|
||||
mtime = os.stat(rpath).st_mtime
|
||||
if 0 <= self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with open(metapath, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
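The rewrite above splits HTTP cache handling into a middleware plus a pluggable storage backend, configured entirely through settings. A hedged sketch of enabling it in a project's settings file (the setting names are taken from this changeset; the values are only examples):

    # settings.py -- illustrative values
    HTTPCACHE_DIR = '/tmp/scrapy-cache'        # enables the filesystem cache storage
    HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24   # treat entries older than a day as expired
    HTTPCACHE_IGNORE_MISSING = False           # don't ignore requests missing from the cache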
scrapy/contrib/downloadermiddleware/httpproxy.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+import base64
+from urllib import getproxies, unquote, proxy_bypass
+from urllib2 import _parse_proxy
+from urlparse import urlunparse
+
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.core.exceptions import NotConfigured
+
+
+class HttpProxyMiddleware(object):
+
+    def __init__(self):
+        self.proxies = {}
+        for type, url in getproxies().items():
+            self.proxies[type] = self._get_proxy(url, type)
+
+        if not self.proxies:
+            raise NotConfigured
+
+    def _get_proxy(self, url, orig_type):
+        proxy_type, user, password, hostport = _parse_proxy(url)
+        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
+
+        if user and password:
+            user_pass = '%s:%s' % (unquote(user), unquote(password))
+            creds = base64.b64encode(user_pass).strip()
+        else:
+            creds = None
+
+        return creds, proxy_url
+
+    def process_request(self, request, spider):
+        # ignore if proxy is already set
+        if 'proxy' in request.meta:
+            return
+
+        parsed = urlparse_cached(request)
+        scheme = parsed.scheme
+
+        # 'no_proxy' is only supported by http schemes
+        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
+            return
+
+        if scheme in self.proxies:
+            self._set_proxy(request, scheme)
+
+    def _set_proxy(self, request, scheme):
+        creds, proxy = self.proxies[scheme]
+        request.meta['proxy'] = proxy
+        if creds:
+            request.headers['Proxy-Authorization'] = 'Basic ' + creds
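The middleware above reads proxies from the standard environment variables via getproxies(); a request can also carry its own proxy in its meta dict, which then takes precedence over environment discovery. A short sketch (the proxy URL is made up):

    # shell: export http_proxy=http://user:pass@proxy.example.com:3128
    # or, per request, set the proxy explicitly:
    from scrapy.http import Request

    req = Request('http://scrapy.org')
    req.meta['proxy'] = 'http://proxy.example.com:3128'  # hypothetical proxy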
@ -14,13 +14,9 @@ class RedirectMiddleware(object):
|
||||
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
domain = spider.domain_name
|
||||
|
||||
if response.status in [302, 303] and 'Location' in response.headers:
|
||||
redirected_url = urljoin_rfc(request.url, response.headers['location'])
|
||||
redirected = request.replace(url=redirected_url, method='GET', body='')
|
||||
redirected.headers.pop('Content-Type', None)
|
||||
redirected.headers.pop('Content-Length', None)
|
||||
redirected = self._redirect_request_using_get(request, redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
if response.status in [301, 307] and 'Location' in response.headers:
|
||||
@ -29,8 +25,8 @@ class RedirectMiddleware(object):
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
interval, url = get_meta_refresh(response)
|
||||
if url and int(interval) < self.max_metarefresh_delay:
|
||||
redirected = request.replace(url=urljoin_rfc(request.url, url))
|
||||
if url and interval < self.max_metarefresh_delay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
||||
return response
|
||||
@ -45,11 +41,17 @@ class RedirectMiddleware(object):
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
|
||||
domain=spider.domain_name, level=log.DEBUG)
|
||||
spider=spider, level=log.DEBUG)
|
||||
return redirected
|
||||
else:
|
||||
log.msg("Discarding %s: max redirections reached" % request,
|
||||
domain=spider.domain_name, level=log.DEBUG)
|
||||
spider=spider, level=log.DEBUG)
|
||||
raise IgnoreRequest
|
||||
|
||||
def _redirect_request_using_get(self, request, redirect_url):
|
||||
redirected = request.replace(url=redirect_url, method='GET', body='')
|
||||
redirected.headers.pop('Content-Type', None)
|
||||
redirected.headers.pop('Content-Length', None)
|
||||
return redirected
|
||||
|
||||
|
||||
|
@@ -58,7 +58,7 @@ class RetryMiddleware(object):

         if retries <= self.max_retry_times:
             log.msg("Retrying %s (failed %d times): %s" % (request, retries, reason),
-                    domain=spider.domain_name, level=log.DEBUG)
+                    spider=spider, level=log.DEBUG)
             retryreq = request.copy()
             retryreq.meta['retry_times'] = retries
             retryreq.dont_filter = True
@@ -66,5 +66,5 @@ class RetryMiddleware(object):
             return retryreq
         else:
             log.msg("Discarding %s (failed %d times): %s" % (request, retries, reason),
-                    domain=spider.domain_name, level=log.DEBUG)
+                    spider=spider, level=log.DEBUG)
@ -26,8 +26,8 @@ class RobotsTxtMiddleware(object):
|
||||
self._spider_netlocs = {}
|
||||
self._useragents = {}
|
||||
self._pending = {}
|
||||
dispatcher.connect(self.domain_opened, signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
useragent = self._useragents[spider]
|
||||
@ -52,12 +52,12 @@ class RobotsTxtMiddleware(object):
|
||||
rp.parse(response.body.splitlines())
|
||||
self._parsers[urlparse_cached(response).netloc] = rp
|
||||
|
||||
def domain_opened(self, spider):
|
||||
def spider_opened(self, spider):
|
||||
self._spider_netlocs[spider] = set()
|
||||
self._useragents[spider] = getattr(spider, 'user_agent', None) \
|
||||
or settings['USER_AGENT']
|
||||
|
||||
def domain_closed(self, domain, spider):
|
||||
def spider_closed(self, spider):
|
||||
for netloc in self._spider_netlocs[domain]:
|
||||
del self._parsers[netloc]
|
||||
del self._spider_netlocs[domain]
|
||||
|
@ -5,37 +5,30 @@ from scrapy.stats import stats
|
||||
from scrapy.conf import settings
|
||||
|
||||
class DownloaderStats(object):
|
||||
"""DownloaderStats store stats of all requests, responses and
|
||||
exceptions that pass through it.
|
||||
|
||||
To use this middleware you must enable the DOWNLOADER_STATS setting.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
if not settings.getbool('DOWNLOADER_STATS'):
|
||||
raise NotConfigured
|
||||
|
||||
def process_request(self, request, spider):
|
||||
domain = spider.domain_name
|
||||
stats.inc_value('downloader/request_count')
|
||||
stats.inc_value('downloader/request_count', domain=domain)
|
||||
stats.inc_value('downloader/request_method_count/%s' % request.method, domain=domain)
|
||||
stats.inc_value('downloader/request_count', spider=spider)
|
||||
stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
|
||||
reqlen = len(request_httprepr(request))
|
||||
stats.inc_value('downloader/request_bytes', reqlen, domain=domain)
|
||||
stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
|
||||
stats.inc_value('downloader/request_bytes', reqlen)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
domain = spider.domain_name
|
||||
stats.inc_value('downloader/response_count')
|
||||
stats.inc_value('downloader/response_count', domain=domain)
|
||||
stats.inc_value('downloader/response_status_count/%s' % response.status, domain=domain)
|
||||
stats.inc_value('downloader/response_count', spider=spider)
|
||||
stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
|
||||
reslen = len(response_httprepr(response))
|
||||
stats.inc_value('downloader/response_bytes', reslen, domain=domain)
|
||||
stats.inc_value('downloader/response_bytes', reslen, spider=spider)
|
||||
stats.inc_value('downloader/response_bytes', reslen)
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
|
||||
stats.inc_value('downloader/exception_count')
|
||||
stats.inc_value('downloader/exception_count', domain=spider.domain_name)
|
||||
stats.inc_value('downloader/exception_type_count/%s' % ex_class, domain=spider.domain_name)
|
||||
stats.inc_value('downloader/exception_count', spider=spider)
|
||||
stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
|
||||
|
@ -15,8 +15,8 @@ Settings that affect this module:
|
||||
|
||||
ITEMSAMPLER_FILE
|
||||
file where to store the pickled dict of scraped items
|
||||
ITEMSAMPLER_CLOSE_DOMAIN
|
||||
wether to close the domain after enough products have been sampled
|
||||
ITEMSAMPLER_CLOSE_SPIDER
|
||||
wether to close the spider after enough products have been sampled
|
||||
ITEMSAMPLER_MAX_RESPONSE_SIZE
|
||||
maximum response size to process
|
||||
"""
|
||||
@ -36,8 +36,8 @@ from scrapy.http import Request
|
||||
from scrapy import log
|
||||
from scrapy.conf import settings
|
||||
|
||||
items_per_domain = settings.getint('ITEMSAMPLER_COUNT', 1)
|
||||
close_domain = settings.getbool('ITEMSAMPLER_CLOSE_DOMAIN', False)
|
||||
items_per_spider = settings.getint('ITEMSAMPLER_COUNT', 1)
|
||||
close_spider = settings.getbool('ITEMSAMPLER_CLOSE_SPIDER', False)
|
||||
max_response_size = settings.getbool('ITEMSAMPLER_MAX_RESPONSE_SIZE', )
|
||||
|
||||
class ItemSamplerPipeline(object):
|
||||
@ -47,20 +47,19 @@ class ItemSamplerPipeline(object):
|
||||
if not self.filename:
|
||||
raise NotConfigured
|
||||
self.items = {}
|
||||
self.domains_count = 0
|
||||
self.spiders_count = 0
|
||||
self.empty_domains = set()
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
|
||||
|
||||
def process_item(self, item, spider):
|
||||
domain = spider.domain_name
|
||||
sampled = stats.get_value("items_sampled", 0, domain=domain)
|
||||
if sampled < items_per_domain:
|
||||
def process_item(self, spider, item):
|
||||
sampled = stats.get_value("items_sampled", 0, spider=spider)
|
||||
if sampled < items_per_spider:
|
||||
self.items[item.guid] = item
|
||||
sampled += 1
|
||||
stats.set_value("items_sampled", sampled, domain=domain)
|
||||
log.msg("Sampled %s" % item, domain=domain, level=log.INFO)
|
||||
if close_domain and sampled == items_per_domain:
|
||||
stats.set_value("items_sampled", sampled, spider=spider)
|
||||
log.msg("Sampled %s" % item, spider=spider, level=log.INFO)
|
||||
if close_spider and sampled == items_per_spider:
|
||||
scrapyengine.close_spider(spider)
|
||||
return item
|
||||
|
||||
@ -68,13 +67,15 @@ class ItemSamplerPipeline(object):
|
||||
with open(self.filename, 'w') as f:
|
||||
pickle.dump(self.items, f)
|
||||
if self.empty_domains:
|
||||
log.msg("No products sampled for: %s" % " ".join(self.empty_domains), level=log.WARNING)
|
||||
log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \
|
||||
level=log.WARNING)
|
||||
|
||||
def domain_closed(self, domain, spider, reason):
|
||||
if reason == 'finished' and not stats.get_value("items_sampled", domain=domain):
|
||||
self.empty_domains.add(domain)
|
||||
self.domains_count += 1
|
||||
log.msg("Sampled %d domains so far (%d empty)" % (self.domains_count, len(self.empty_domains)), level=log.INFO)
|
||||
def spider_closed(self, spider, reason):
|
||||
if reason == 'finished' and not stats.get_value("items_sampled", spider=spider):
|
||||
self.empty_domains.add(spider.domain_name)
|
||||
self.spiders_count += 1
|
||||
log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \
|
||||
len(self.empty_domains)), level=log.INFO)
|
||||
|
||||
|
||||
class ItemSamplerMiddleware(object):
|
||||
@ -86,7 +87,7 @@ class ItemSamplerMiddleware(object):
|
||||
raise NotConfigured
|
||||
|
||||
def process_spider_input(self, response, spider):
|
||||
if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_domain:
|
||||
if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
|
||||
return []
|
||||
elif max_response_size and max_response_size > len(response_httprepr(response)):
|
||||
return []
|
||||
@ -99,7 +100,7 @@ class ItemSamplerMiddleware(object):
|
||||
else:
|
||||
items.append(r)
|
||||
|
||||
if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_domain:
|
||||
if stats.get_value("items_sampled", spider=spider) >= items_per_spider:
|
||||
return []
|
||||
else:
|
||||
# TODO: this needs some revision, as keeping only the first item
|
||||
|
@ -9,7 +9,7 @@ from lxml import etree
|
||||
import lxml.html
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.utils.python import unique as unique_list, str_to_unicode
|
||||
from scrapy.utils.url import safe_url_string, urljoin_rfc
|
||||
|
||||
class LxmlLinkExtractor(object):
|
||||
@ -33,7 +33,7 @@ class LxmlLinkExtractor(object):
|
||||
for link in links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
link.text = link.text.decode(response_encoding)
|
||||
link.text = str_to_unicode(link.text, response_encoding)
|
||||
ret.append(link)
|
||||
|
||||
return ret
|
||||
@ -43,6 +43,10 @@ class LxmlLinkExtractor(object):
|
||||
return self._extract_links(response.body, response.url,
|
||||
response.encoding)
|
||||
|
||||
def matches(self, url):
|
||||
"""This extractor matches with any url, since it doesn't contain any patterns"""
|
||||
return True
|
||||
|
||||
|
||||
class LinkTarget(object):
|
||||
def __init__(self, scan_tag, scan_attr, process_attr):
|
||||
|
scrapy/contrib/logformatter.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+"""Functions for logging different actions"""
+
+def crawled_logline(request, response):
+    referer = request.headers.get('Referer')
+    flags = ' %s' % str(response.flags) if response.flags else ''
+    return "Crawled (%d) %s (referer: %s)%s" % (response.status, \
+        request, referer, flags)
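A quick illustration of the helper added above (Python 2 syntax, matching the codebase; the exact request representation in the output may differ):

    from scrapy.http import Request, Response
    from scrapy.contrib.logformatter import crawled_logline

    req = Request('http://www.example.com/')
    resp = Response('http://www.example.com/', status=200)
    resp.flags.append('cached')
    print crawled_logline(req, resp)
    # prints something like: Crawled (200) <GET http://www.example.com/> (referer: None) ['cached']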
@@ -57,7 +57,7 @@ class ItemPipelineManager(object):
             if not stages_left:
                 return item
             current_stage = stages_left.pop(0)
-            d = mustbe_deferred(current_stage.process_item, spider.domain_name, item)
+            d = mustbe_deferred(current_stage.process_item, spider, item)
             d.addCallback(next_stage, stages_left)
             return d

@@ -17,7 +17,7 @@ class FileExportPipeline(object):
         self.exporter.start_exporting()
         dispatcher.connect(self.engine_stopped, signals.engine_stopped)

-    def process_item(self, domain, item):
+    def process_item(self, spider, item):
         self.exporter.export_item(item)
         return item
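As the two hunks above show, the pipeline entry point now receives the spider object first and the item second. A minimal pipeline written against that calling convention (the class name and body are illustrative):

    class PassThroughPipeline(object):
        """Hypothetical pipeline using the new (spider, item) signature."""

        def process_item(self, spider, item):
            # a real pipeline could validate, clean or store the item here
            return item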
@ -44,10 +44,10 @@ class FSImagesStore(object):
|
||||
self.basedir = basedir
|
||||
self._mkdir(self.basedir)
|
||||
self.created_directories = defaultdict(set)
|
||||
dispatcher.connect(self.domain_closed, signals.domain_closed)
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
|
||||
def domain_closed(self, domain):
|
||||
self.created_directories.pop(domain, None)
|
||||
def spider_closed(self, spider):
|
||||
self.created_directories.pop(spider.domain_name, None)
|
||||
|
||||
def persist_image(self, key, image, buf, info):
|
||||
absolute_path = self._get_filesystem_path(key)
|
||||
@ -207,28 +207,28 @@ class ImagesPipeline(MediaPipeline):
|
||||
|
||||
if response.status != 200:
|
||||
log.msg('Image (http-error): Error downloading image from %s referred in <%s>' \
|
||||
% (request, referer), level=log.WARNING, domain=info.domain)
|
||||
% (request, referer), level=log.WARNING, spider=info.spider)
|
||||
raise ImageException
|
||||
|
||||
if not response.body:
|
||||
log.msg('Image (empty-content): Empty image from %s referred in <%s>: no-content' \
|
||||
% (request, referer), level=log.WARNING, domain=info.domain)
|
||||
% (request, referer), level=log.WARNING, spider=info.spider)
|
||||
raise ImageException
|
||||
|
||||
status = 'cached' if 'cached' in response.flags else 'downloaded'
|
||||
msg = 'Image (%s): Downloaded image from %s referred in <%s>' % \
|
||||
(status, request, referer)
|
||||
log.msg(msg, level=log.DEBUG, domain=info.domain)
|
||||
self.inc_stats(info.domain, status)
|
||||
log.msg(msg, level=log.DEBUG, spider=info.spider)
|
||||
self.inc_stats(info.spider, status)
|
||||
|
||||
try:
|
||||
key = self.image_key(request.url)
|
||||
checksum = self.image_downloaded(response, request, info)
|
||||
except ImageException, ex:
|
||||
log.msg(str(ex), level=log.WARNING, domain=info.domain)
|
||||
log.msg(str(ex), level=log.WARNING, spider=info.spider)
|
||||
raise
|
||||
except Exception:
|
||||
log.err(domain=info.domain)
|
||||
log.err(spider=info.spider)
|
||||
raise ImageException
|
||||
|
||||
return {'url': request.url, 'path': key, 'checksum': checksum}
|
||||
@ -238,7 +238,7 @@ class ImagesPipeline(MediaPipeline):
|
||||
referer = request.headers.get('Referer')
|
||||
msg = 'Image (unknown-error): Error downloading %s from %s referred in <%s>: %s' \
|
||||
% (self.MEDIA_NAME, request, referer, str(failure))
|
||||
log.msg(msg, level=log.WARNING, domain=info.domain)
|
||||
log.msg(msg, level=log.WARNING, spider=info.spider)
|
||||
raise ImageException
|
||||
|
||||
def media_to_download(self, request, info):
|
||||
@ -257,8 +257,8 @@ class ImagesPipeline(MediaPipeline):
|
||||
|
||||
referer = request.headers.get('Referer')
|
||||
log.msg('Image (uptodate): Downloaded %s from <%s> referred in <%s>' % \
|
||||
(self.MEDIA_NAME, request.url, referer), level=log.DEBUG, domain=info.domain)
|
||||
self.inc_stats(info.domain, 'uptodate')
|
||||
(self.MEDIA_NAME, request.url, referer), level=log.DEBUG, spider=info.spider)
|
||||
self.inc_stats(info.spider, 'uptodate')
|
||||
|
||||
checksum = result.get('checksum', None)
|
||||
return {'url': request.url, 'path': key, 'checksum': checksum}
|
||||
@ -295,9 +295,9 @@ class ImagesPipeline(MediaPipeline):
|
||||
thumb_image, thumb_buf = self.convert_image(image, size)
|
||||
yield thumb_key, thumb_image, thumb_buf
|
||||
|
||||
def inc_stats(self, domain, status):
|
||||
stats.inc_value('image_count', domain=domain)
|
||||
stats.inc_value('image_status_count/%s' % status, domain=domain)
|
||||
def inc_stats(self, spider, status):
|
||||
stats.inc_value('image_count', spider=spider)
|
||||
stats.inc_value('image_status_count/%s' % status, spider=spider)
|
||||
|
||||
def convert_image(self, image, size=None):
|
||||
if image.mode != 'RGB':
|
||||
|
@ -12,27 +12,26 @@ from scrapy.utils.misc import arg_to_iter
|
||||
class MediaPipeline(object):
|
||||
DOWNLOAD_PRIORITY = 1000
|
||||
|
||||
class DomainInfo(object):
|
||||
class SpiderInfo(object):
|
||||
def __init__(self, spider):
|
||||
self.domain = spider.domain_name
|
||||
self.spider = spider
|
||||
self.downloading = {}
|
||||
self.downloaded = {}
|
||||
self.waiting = {}
|
||||
|
||||
def __init__(self):
|
||||
self.domaininfo = {}
|
||||
dispatcher.connect(self.domain_opened, signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signals.domain_closed)
|
||||
self.spiderinfo = {}
|
||||
dispatcher.connect(self.spider_opened, signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
|
||||
def domain_opened(self, spider):
|
||||
self.domaininfo[spider.domain_name] = self.DomainInfo(spider)
|
||||
def spider_opened(self, spider):
|
||||
self.spiderinfo[spider] = self.SpiderInfo(spider)
|
||||
|
||||
def domain_closed(self, domain):
|
||||
del self.domaininfo[domain]
|
||||
def spider_closed(self, spider):
|
||||
del self.spiderinfo[spider]
|
||||
|
||||
def process_item(self, domain, item):
|
||||
info = self.domaininfo[domain]
|
||||
def process_item(self, spider, item):
|
||||
info = self.spiderinfo[spider]
|
||||
requests = arg_to_iter(self.get_media_requests(item, info))
|
||||
dlist = []
|
||||
for request in requests:
|
||||
@ -83,7 +82,7 @@ class MediaPipeline(object):
|
||||
|
||||
info.downloading[fp] = (request, dwld) # fill downloading state data
|
||||
dwld.addBoth(_downloaded) # append post-download hook
|
||||
dwld.addErrback(log.err, domain=info.domain)
|
||||
dwld.addErrback(log.err, spider=info.spider)
|
||||
|
||||
# declare request in downloading state (None is used as place holder)
|
||||
info.downloading[fp] = None
|
||||
|
@ -16,14 +16,14 @@ class CachingResolver(object):
|
||||
self.resolver = _CachingThreadedResolver(reactor)
|
||||
reactor.installResolver(self.resolver)
|
||||
dispatcher.connect(self.request_received, signals.request_received)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def request_received(self, request, spider):
|
||||
url_hostname = urlparse_cached(request).hostname
|
||||
self.spider_hostnames[spider.domain_name].add(url_hostname)
|
||||
self.spider_hostnames[spider].add(url_hostname)
|
||||
|
||||
def domain_closed(self, spider):
|
||||
for hostname in self.spider_hostnames:
|
||||
def spider_closed(self, spider):
|
||||
for hostname in self.spider_hostnames[spider]:
|
||||
self.resolver._cache.pop(hostname, None)
|
||||
|
||||
|
||||
|
@ -84,7 +84,7 @@ class TwistedPluginSpiderManager(object):
|
||||
module_name = spider.__module__
|
||||
module = sys.modules[module_name]
|
||||
if hasattr(module, 'SPIDER'):
|
||||
log.msg("Reloading module %s" % module_name, domain=domain, \
|
||||
log.msg("Reloading module %s" % module_name, spider=spider, \
|
||||
level=log.DEBUG)
|
||||
new_module = rebuild(module, doLog=0)
|
||||
self._spiders[domain] = new_module.SPIDER
|
||||
|
@ -18,24 +18,23 @@ class DepthMiddleware(object):
|
||||
stats.set_value('envinfo/request_depth_limit', self.maxdepth)
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
domain = spider.domain_name
|
||||
def _filter(request):
|
||||
if isinstance(request, Request):
|
||||
depth = response.request.meta['depth'] + 1
|
||||
request.meta['depth'] = depth
|
||||
if self.maxdepth and depth > self.maxdepth:
|
||||
log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url), \
|
||||
level=log.DEBUG, domain=domain)
|
||||
level=log.DEBUG, spider=spider)
|
||||
return False
|
||||
elif self.stats:
|
||||
stats.inc_value('request_depth_count/%s' % depth, domain=domain)
|
||||
if depth > stats.get_value('request_depth_max', 0, domain=domain):
|
||||
stats.set_value('request_depth_max', depth, domain=domain)
|
||||
stats.inc_value('request_depth_count/%s' % depth, spider=spider)
|
||||
if depth > stats.get_value('request_depth_max', 0, spider=spider):
|
||||
stats.set_value('request_depth_max', depth, spider=spider)
|
||||
return True
|
||||
|
||||
# base case (depth=0)
|
||||
if self.stats and 'depth' not in response.request.meta:
|
||||
response.request.meta['depth'] = 0
|
||||
stats.inc_value('request_depth_count/0', domain=domain)
|
||||
stats.inc_value('request_depth_count/0', spider=spider)
|
||||
|
||||
return (r for r in result or () if _filter(r))
|
||||
|
@@ -10,17 +10,29 @@ from scrapy.xlib.pydispatch import dispatcher
 from scrapy.core import signals
 from scrapy.http import Request
 from scrapy.utils.httpobj import urlparse_cached
+from scrapy import log

 class OffsiteMiddleware(object):

     def __init__(self):
         self.host_regexes = {}
-        dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
-        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
+        self.domains_seen = {}
+        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
+        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

     def process_spider_output(self, response, result, spider):
-        return (x for x in result if not isinstance(x, Request) or \
-            self.should_follow(x, spider))
+        for x in result:
+            if isinstance(x, Request):
+                if self.should_follow(x, spider):
+                    yield x
+                else:
+                    domain = urlparse_cached(x).hostname
+                    if domain and domain not in self.domains_seen[spider]:
+                        log.msg("Filtered offsite request to %r: %s" % (domain, x),
+                                level=log.DEBUG, spider=spider)
+                        self.domains_seen[spider].add(domain)
+            else:
+                yield x

     def should_follow(self, request, spider):
         regex = self.host_regexes[spider]
@@ -34,9 +46,11 @@ class OffsiteMiddleware(object):
         regex = r'^(.*\.)?(%s)$' % '|'.join(domains)
         return re.compile(regex)

-    def domain_opened(self, spider):
+    def spider_opened(self, spider):
         domains = [spider.domain_name] + spider.extra_domain_names
         self.host_regexes[spider] = self.get_host_regex(domains)
+        self.domains_seen[spider] = set()

-    def domain_closed(self, spider):
+    def spider_closed(self, spider):
         del self.host_regexes[spider]
+        del self.domains_seen[spider]
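The middleware builds one host regex per spider from its domain attributes; with the pattern shown above, example.com and any of its subdomains pass the filter, while everything else is dropped and now logged once per offsite domain. A sketch of a spider the filter would act on (the spider attribute names are taken from this changeset; the class name and URLs are made up):

    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        domain_name = 'example.com'
        extra_domain_names = ['shop.example.com']
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            pass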
@ -23,44 +23,43 @@ class RequestLimitMiddleware(object):
|
||||
self.max_pending = {}
|
||||
self.dropped_count = {}
|
||||
|
||||
dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def domain_opened(self, domain, spider):
|
||||
self.max_pending[domain] = getattr(spider, 'requests_queue_size', self.max_queue_size)
|
||||
self.dropped_count[domain] = 0
|
||||
def spider_opened(self, spider):
|
||||
self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size)
|
||||
self.dropped_count[spider] = 0
|
||||
|
||||
def domain_closed(self, domain):
|
||||
dropped_count = self.dropped_count[domain]
|
||||
def spider_closed(self, spider):
|
||||
dropped_count = self.dropped_count[spider]
|
||||
if dropped_count:
|
||||
max_pending = self.max_pending[domain]
|
||||
max_pending = self.max_pending[spider]
|
||||
log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \
|
||||
(dropped_count, max_pending), level=log.DEBUG, domain=domain)
|
||||
del self.dropped_count[domain]
|
||||
del self.max_pending[domain]
|
||||
(dropped_count, max_pending), level=log.DEBUG, spider=spider)
|
||||
del self.dropped_count[spider]
|
||||
del self.max_pending[spider]
|
||||
|
||||
def process_spider_output(self, response, result, spider):
|
||||
domain = spider.domain_name
|
||||
max_pending = self.max_pending.get(domain, 0)
|
||||
max_pending = self.max_pending.get(spider, 0)
|
||||
if max_pending:
|
||||
return imap(lambda v: self._limit_requests(v, domain, max_pending), result)
|
||||
return imap(lambda v: self._limit_requests(v, spider, max_pending), result)
|
||||
else:
|
||||
return result
|
||||
|
||||
def _limit_requests(self, request_or_other, domain, max_pending):
|
||||
def _limit_requests(self, request_or_other, spider, max_pending):
|
||||
if isinstance(request_or_other, Request):
|
||||
free_slots = max_pending - self._pending_count(domain)
|
||||
free_slots = max_pending - self._pending_count(spider)
|
||||
if free_slots > 0:
|
||||
# Scheduler isn't saturated and it is fine to schedule more requests.
|
||||
return request_or_other
|
||||
else:
|
||||
# Skip the request and give engine time to handle other tasks.
|
||||
self.dropped_count[domain] += 1
|
||||
self.dropped_count[spider] += 1
|
||||
return None
|
||||
else:
|
||||
# Return others (non-requests) as is.
|
||||
return request_or_other
|
||||
|
||||
def _pending_count(self, domain):
|
||||
pending = scrapyengine.scheduler.pending_requests.get(domain, [])
|
||||
def _pending_count(self, spider):
|
||||
pending = scrapyengine.scheduler.pending_requests.get(spider, [])
|
||||
return len(pending)
|
||||
|
@@ -19,7 +19,7 @@ class UrlLengthMiddleware(object):
         def _filter(request):
             if isinstance(request, Request) and len(request.url) > self.maxlength:
                 log.msg("Ignoring link (url length > %d): %s " % (self.maxlength, request.url), \
-                    level=log.DEBUG, domain=spider.domain_name)
+                    level=log.DEBUG, spider=spider)
                 return False
             else:
                 return True
@@ -90,7 +90,6 @@ class CrawlSpider(InitSpider):
         matching each case, filters them (if needed), and returns a list of unique
         requests per response.
         """
-        requests = []
         seen = set()
         for rule in self._rules:
             links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
@@ -101,8 +100,7 @@ class CrawlSpider(InitSpider):
                 r = Request(url=link.url)
                 r.meta['link_text'] = link.text
                 r.deferred.addCallback(self._response_downloaded, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
-                requests.append(r)
-        return requests
+                yield r

     def _response_downloaded(self, response, callback, cb_kwargs, follow):
         """
@@ -110,15 +108,16 @@ class CrawlSpider(InitSpider):
         to extract links or not from it, and if it will be parsed or not.
         It returns a list of requests/items.
         """
-        res = []
-
-        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
-            res.extend(self._requests_to_follow(response))
         if callback:
             cb_res = callback(response, **cb_kwargs) or ()
             cb_res = self.process_results(response, cb_res)
-            res.extend(iterate_spider_output(cb_res))
-        return res
+            for requests_or_item in iterate_spider_output(cb_res):
+                yield requests_or_item
+
+        if follow and settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True):
+            for request_or_item in self._requests_to_follow(response):
+                yield request_or_item

     def _compile_rules(self):
         """Compile the crawling rules"""
@@ -1,5 +1,5 @@
 """
-StatsMailer extension sends an email when a domain finishes scraping.
+StatsMailer extension sends an email when a spider finishes scraping.

 Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
 """
@@ -17,12 +17,12 @@ class StatsMailer(object):
         self.recipients = settings.getlist("STATSMAILER_RCPTS")
         if not self.recipients:
             raise NotConfigured
-        dispatcher.connect(self.stats_domain_closed, signal=signals.stats_domain_closed)
+        dispatcher.connect(self.stats_spider_closed, signal=signals.stats_spider_closed)

-    def stats_domain_closed(self, domain, domain_stats):
+    def stats_spider_closed(self, spider, spider_stats):
         mail = MailSender()
         body = "Global stats\n\n"
         body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items())
-        body += "\n\n%s stats\n\n" % domain
-        body += "\n".join("%-50s : %s" % i for i in domain_stats.items())
-        mail.send(self.recipients, "Scrapy stats for: %s" % domain, body)
+        body += "\n\n%s stats\n\n" % spider.domain_name
+        body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
+        mail.send(self.recipients, "Scrapy stats for: %s" % spider.domain_name, body)
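The extension stays opt-in: it only activates when a recipient list is configured. For example, in a project's settings file (the address is made up):

    STATSMAILER_RCPTS = ['scrapy-stats@example.com']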
@ -20,49 +20,49 @@ class LiveStats(object):
|
||||
|
||||
def __init__(self):
|
||||
self.domains = {}
|
||||
dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
|
||||
dispatcher.connect(self.response_downloaded, signal=signals.response_downloaded)
|
||||
|
||||
dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)
|
||||
|
||||
def domain_opened(self, domain, spider):
|
||||
def spider_opened(self, spider):
|
||||
pstats = SpiderStats()
|
||||
self.domains[spider.domain_name] = pstats
|
||||
self.domains[spider] = pstats
|
||||
pstats.started = datetime.now().replace(microsecond=0)
|
||||
pstats.finished = None
|
||||
|
||||
def domain_closed(self, domain, spider):
|
||||
self.domains[spider.domain_name].finished = datetime.now().replace(microsecond=0)
|
||||
def spider_closed(self, spider):
|
||||
self.domains[spider].finished = datetime.now().replace(microsecond=0)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
self.domains[spider.domain_name].scraped += 1
|
||||
self.domains[spider].scraped += 1
|
||||
|
||||
def response_downloaded(self, response, spider):
|
||||
# sometimes we download responses without opening/closing domains,
|
||||
# for example from scrapy shell
|
||||
if self.domains.get(spider.domain_name):
|
||||
self.domains[spider.domain_name].crawled += 1
|
||||
if self.domains.get(spider):
|
||||
self.domains[spider].crawled += 1
|
||||
|
||||
def webconsole_render(self, wc_request):
|
||||
sch = scrapyengine.scheduler
|
||||
dwl = scrapyengine.downloader
|
||||
|
||||
totdomains = totscraped = totcrawled = totscheduled = totactive = totpending = totdqueued = tottransf = 0
|
||||
totdomains = totscraped = totcrawled = totscheduled = totactive = totdqueued = tottransf = 0
|
||||
s = banner(self)
|
||||
s += "<table border='1'>\n"
|
||||
s += "<tr><th>Domain</th><th>Items<br>Scraped</th><th>Pages<br>Crawled</th><th>Scheduler<br>Pending</th><th>Downloader<br/>Queued</th><th>Downloader<br/>Active</th><th>Downloader<br/>Transferring</th><th>Start time</th><th>Finish time</th><th>Run time</th></tr>\n"
|
||||
for d in sorted(self.domains.keys()):
|
||||
scheduled = len(sch.pending_requests[d]) if d in sch.pending_requests else 0
|
||||
active = len(dwl.sites[d].active) if d in dwl.sites else 0
|
||||
dqueued = len(dwl.sites[d].queue) if d in dwl.sites else 0
|
||||
transf = len(dwl.sites[d].transferring) if d in dwl.sites else 0
|
||||
stats = self.domains[d]
|
||||
s += "<tr><th>Spider</th><th>Items<br>Scraped</th><th>Pages<br>Crawled</th><th>Scheduler<br>Pending</th><th>Downloader<br/>Queued</th><th>Downloader<br/>Active</th><th>Downloader<br/>Transferring</th><th>Start time</th><th>Finish time</th><th>Run time</th></tr>\n"
|
||||
for spider in sorted(self.domains.keys()):
|
||||
scheduled = len(sch.pending_requests[spider]) if spider in sch.pending_requests else 0
|
||||
active = len(dwl.sites[spider].active) if spider in dwl.sites else 0
|
||||
dqueued = len(dwl.sites[spider].queue) if spider in dwl.sites else 0
|
||||
transf = len(dwl.sites[spider].transferring) if spider in dwl.sites else 0
|
||||
stats = self.domains[spider]
|
||||
runtime = stats.finished - stats.started if stats.finished else datetime.now() - stats.started
|
||||
|
||||
s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % \
|
||||
(d, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime))
|
||||
(spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime))
|
||||
|
||||
totdomains += 1
|
||||
totscraped += stats.scraped
|
||||
|
@ -18,16 +18,16 @@ class Spiderctl(object):
|
||||
def __init__(self):
|
||||
self.running = {}
|
||||
self.finished = set()
|
||||
dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
|
||||
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
from scrapy.management.web import webconsole_discover_module
|
||||
dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)
|
||||
|
||||
def domain_opened(self, spider):
|
||||
def spider_opened(self, spider):
|
||||
self.running[spider.domain_name] = spider
|
||||
|
||||
def domain_closed(self, spider):
|
||||
def spider_closed(self, spider):
|
||||
del self.running[spider.domain_name]
|
||||
self.finished.add(spider.domain_name)
|
||||
|
||||
@ -44,9 +44,9 @@ class Spiderctl(object):
|
||||
s += '<table border=1">\n'
|
||||
s += "<tr><th>Idle (%d)</th><th>Scheduled (%d)</th><th>Running (%d/%d)</th><th>Finished (%d)</th></tr>\n" % \
|
||||
(len(self.idle),
|
||||
len(self.running),
|
||||
settings['CONCURRENT_DOMAINS'],
|
||||
len(self.scheduled),
|
||||
len(self.running),
|
||||
settings['CONCURRENT_SPIDERS'],
|
||||
len(self.finished))
|
||||
s += "<tr>\n"
|
||||
|
||||
|
@ -22,9 +22,9 @@ class StatsDump(object):
|
||||
s = banner(self)
|
||||
s += "<h3>Global stats</h3>\n"
|
||||
s += stats_html_table(stats.get_stats())
|
||||
for domain in stats.list_domains():
|
||||
s += "<h3>%s</h3>\n" % domain
|
||||
s += stats_html_table(stats.get_stats(domain))
|
||||
for spider, spider_stats in stats.iter_spider_stats():
|
||||
s += "<h3>%s</h3>\n" % spider.domain_name
|
||||
s += stats_html_table(spider_stats)
|
||||
s += "</body>\n"
|
||||
s += "</html>\n"
|
||||
|
||||
|
@ -2,10 +2,10 @@
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
"""
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import zipfile
|
||||
import tarfile
|
||||
import gzip
|
||||
import bz2
|
||||
from cStringIO import StringIO
|
||||
from tempfile import mktemp
|
||||
|
||||
@ -13,79 +13,69 @@ from scrapy import log
|
||||
from scrapy.http import Response
|
||||
from scrapy.core.downloader.responsetypes import responsetypes
|
||||
|
||||
|
||||
class DecompressionMiddleware(object):
|
||||
""" This middleware tries to recognise and extract the possibly compressed
|
||||
responses that may arrive. """
|
||||
|
||||
def __init__(self):
|
||||
self.decompressors = {
|
||||
'tar': self.is_tar,
|
||||
'zip': self.is_zip,
|
||||
'gz': self.is_gzip,
|
||||
'bz2': self.is_bzip2
|
||||
self._formats = {
|
||||
'tar': self._is_tar,
|
||||
'zip': self._is_zip,
|
||||
'gz': self._is_gzip,
|
||||
'bz2': self._is_bzip2
|
||||
}
|
||||
|
||||
def is_tar(self, response):
|
||||
def _is_tar(self, response):
|
||||
archive = StringIO(response.body)
|
||||
try:
|
||||
tar_file = tarfile.open(name=mktemp(), fileobj=self.archive)
|
||||
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
|
||||
except tarfile.ReadError:
|
||||
return False
|
||||
if tar_file.members:
|
||||
body = body=tar_file.extractfile(tar_file.members[0]).read()
|
||||
return
|
||||
|
||||
body = tar_file.extractfile(tar_file.members[0]).read()
|
||||
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
else:
|
||||
raise self.ArchiveIsEmpty
|
||||
|
||||
def is_zip(self, response):
|
||||
def _is_zip(self, response):
|
||||
archive = StringIO(response.body)
|
||||
try:
|
||||
zip_file = zipfile.ZipFile(self.archive)
|
||||
zip_file = zipfile.ZipFile(archive)
|
||||
except zipfile.BadZipfile:
|
||||
return False
|
||||
return
|
||||
|
||||
namelist = zip_file.namelist()
|
||||
if namelist:
|
||||
body = zip_file.read(namelist[0])
|
||||
respcls = responsetypes.from_args(filename=namelist[0], body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
else:
|
||||
raise self.ArchiveIsEmpty
|
||||
|
||||
def is_gzip(self, response):
|
||||
def _is_gzip(self, response):
|
||||
archive = StringIO(response.body)
|
||||
try:
|
||||
gzip_file = gzip.GzipFile(fileobj=self.archive)
|
||||
decompressed_body = gzip_file.read()
|
||||
body = gzip.GzipFile(fileobj=archive).read()
|
||||
except IOError:
|
||||
return False
|
||||
respcls = responsetypes.from_args(body=decompressed_body)
|
||||
return response.replace(body=decompressed_body, cls=respcls)
|
||||
return
|
||||
|
||||
def is_bzip2(self, response):
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_bzip2(self, response):
|
||||
try:
|
||||
decompressed_body = bz2.decompress(self.body)
|
||||
body = bz2.decompress(response.body)
|
||||
except IOError:
|
||||
return False
|
||||
respcls = responsetypes.from_args(body=decompressed_body)
|
||||
return response.replace(body=decompressed_body, cls=respcls)
|
||||
return
|
||||
|
||||
def extract(self, response):
|
||||
""" This method tries to decompress the given response, if possible,
|
||||
and returns a tuple containing the resulting response, and the name
|
||||
of the used decompressor """
|
||||
|
||||
self.body = response.body
|
||||
self.archive = StringIO()
|
||||
self.archive.write(self.body)
|
||||
|
||||
for decompressor in self.decompressors.keys():
|
||||
self.archive.seek(0)
|
||||
new_response = self.decompressors[decompressor](response)
|
||||
if new_response:
|
||||
return (new_response, decompressor)
|
||||
return (response, None)
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if isinstance(response, Response) and response.body:
|
||||
response, format = self.extract(response)
|
||||
if format:
|
||||
log.msg('Decompressed response with format: %s' % format, log.DEBUG, domain=spider.domain_name)
|
||||
if not response.body:
|
||||
return response
|
||||
|
||||
for fmt, func in self._formats.iteritems():
|
||||
new_response = func(response)
|
||||
if new_response:
|
||||
log.msg('Decompressed response with format: %s' % \
|
||||
fmt, log.DEBUG, spider=spider)
|
||||
return new_response
|
||||
return response
|
||||
|
scrapy/contrib_exp/iterators.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+from scrapy.http import Response
+from scrapy.selector import XmlXPathSelector
+
+
+def xmliter_lxml(obj, nodename):
+    from lxml import etree
+    reader = _StreamReader(obj)
+    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
+    for _, node in iterable:
+        nodetext = etree.tostring(node)
+        node.clear()
+        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
+
+
+class _StreamReader(object):
+
+    def __init__(self, obj):
+        self._ptr = 0
+        if isinstance(obj, Response):
+            self._text, self.encoding = obj.body, obj.encoding
+        else:
+            self._text, self.encoding = obj, 'utf-8'
+        self._is_unicode = isinstance(self._text, unicode)
+
+    def read(self, n=65535):
+        self.read = self._read_unicode if self._is_unicode else self._read_string
+        return self.read(n).lstrip()
+
+    def _read_string(self, n=65535):
+        s, e = self._ptr, self._ptr + n
+        self._ptr = e
+        return self._text[s:e]
+
+    def _read_unicode(self, n=65535):
+        s, e = self._ptr, self._ptr + n
+        self._ptr = e
+        return self._text[s:e].encode('utf-8')
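A usage sketch for the lxml-based iterator added above; it accepts either a Response or a plain string, so the made-up XML string below is enough to exercise it:

    from scrapy.contrib_exp.iterators import xmliter_lxml

    xml = '<rss><item><title>one</title></item><item><title>two</title></item></rss>'
    for node in xmliter_lxml(xml, 'item'):
        print node.select('title/text()').extract()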
scrapy/contrib_exp/loader/__init__.py (new empty file)
scrapy/contrib_exp/loader/lxmlloader.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from lxml import html, etree
+
+from scrapy.contrib.loader import ItemLoader
+
+
+class LxmlItemLoader(ItemLoader):
+
+    def __init__(self, response, item=None, **context):
+        self.tree = html.fromstring(response.body_as_unicode())
+        context.update(response=response)
+        super(LxmlItemLoader, self).__init__(item, **context)
+
+    def add_xpath(self, field_name, xpath):
+        self.add_value(field_name, self._get_xpath(xpath))
+
+    def replace_xpath(self, field_name, xpath):
+        self.replace_value(field_name, self._get_xpath(xpath))
+
+    def _get_xpath(self, xpath):
+        return self._get_values(self.tree.xpath(xpath))
+
+    def add_css(self, field_name, css):
+        self.add_value(field_name, self._get_css(css))
+
+    def replace_css(self, field_name, css):
+        self.replace_value(field_name, self._get_css(css))
+
+    def _get_css(self, css):
+        return self._get_values(self.tree.cssselect(css))
+
+    def _get_values(self, elems):
+        for e in elems:
+            yield etree.tostring(e) if isinstance(e, etree.ElementBase) else e
@ -24,14 +24,14 @@ class ShoveItemPipeline(object):
self.opts = settings['SHOVEITEM_STORE_OPT'] or {}
self.stores = {}

dispatcher.connect(self.domain_opened, signal=signals.domain_opened)
dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

def process_item(self, domain, item):
def process_item(self, spider, item):
guid = str(item.guid)

if guid in self.stores[domain]:
if self.stores[domain][guid] == item:
if guid in self.stores[spider]:
if self.stores[spider][guid] == item:
status = 'old'
else:
status = 'upd'
@ -39,16 +39,17 @@ class ShoveItemPipeline(object):
status = 'new'

if not status == 'old':
self.stores[domain][guid] = item
self.log(domain, item, status)
self.stores[spider][guid] = item
self.log(spider, item, status)
return item

def domain_opened(self, domain):
uri = Template(self.uritpl).substitute(domain=domain)
self.stores[domain] = Shove(uri, **self.opts)
def spider_opened(self, spider):
uri = Template(self.uritpl).substitute(domain=spider.domain_name)
self.stores[spider] = Shove(uri, **self.opts)

def domain_closed(self, domain):
self.stores[domain].sync()
def spider_closed(self, spider):
self.stores[spider].sync()

def log(self, domain, item, status):
log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, domain=domain)
def log(self, spider, item, status):
log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, \
spider=spider)
@ -45,20 +45,18 @@ class SpiderProfiler(object):
r = function(*args, **kwargs)
mafter = self._memusage()
ct = time() - tbefore
domain = spider.domain_name
tcc = stats.get_value('profiling/total_callback_time', 0, domain=domain)
sct = stats.get_value('profiling/slowest_callback_time', 0, domain=domain)
stats.set_value('profiling/total_callback_time' % spider.domain_name, \
tcc+ct, domain=domain)
tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
stats.set_value('profiling/total_callback_time', tcc+ct, spider=spider)
if ct > sct:
stats.set_value('profiling/slowest_callback_time', ct, domain=domain)
stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
stats.set_value('profiling/slowest_callback_name', function.__name__, \
domain=domain)
spider=spider)
stats.set_value('profiling/slowest_callback_url', args[0].url, \
domain=domain)
spider=spider)
if self._memusage:
stats.inc_value('profiling/total_mem_allocated_in_callbacks', \
count=mafter-mbefore, domain=domain)
count=mafter-mbefore, spider=spider)
return r
return new_callback

@ -1,90 +0,0 @@
"""
Download handlers for different schemes
"""
from __future__ import with_statement

import urlparse

from twisted.internet import reactor
try:
from twisted.internet import ssl
except ImportError:
pass

from scrapy import optional_features
from scrapy.core import signals
from scrapy.http import Headers
from scrapy.core.exceptions import NotSupported
from scrapy.utils.defer import defer_succeed
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.signal import send_catch_log
from scrapy.utils.misc import load_object
from scrapy.core.downloader.responsetypes import responsetypes
from scrapy.conf import settings


HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
default_timeout = settings.getint('DOWNLOAD_TIMEOUT')
ssl_supported = 'ssl' in optional_features

def download_any(request, spider):
scheme = urlparse_cached(request).scheme
if scheme == 'http':
return download_http(request, spider)
elif scheme == 'https':
if ssl_supported:
return download_https(request, spider)
else:
raise NotSupported("HTTPS not supported: install pyopenssl library")
elif scheme == 'file':
return download_file(request, spider)
else:
raise NotSupported("Unsupported URL scheme '%s' in: <%s>" % (scheme, request.url))

def create_factory(request, spider):
"""Return HTTPClientFactory for the given Request"""
url = urlparse.urldefrag(request.url)[0]
timeout = getattr(spider, "download_timeout", None) or default_timeout
factory = HTTPClientFactory.from_request(request, timeout)

def _create_response(body):
body = body or ''
status = int(factory.status)
headers = Headers(factory.response_headers)
respcls = responsetypes.from_args(headers=headers, url=url)
r = respcls(url=request.url, status=status, headers=headers, body=body)
send_catch_log(signal=signals.request_uploaded, sender='download_http', \
request=request, spider=spider)
send_catch_log(signal=signals.response_downloaded, sender='download_http', \
response=r, spider=spider)
return r

factory.deferred.addCallbacks(_create_response)
return factory

def download_http(request, spider):
"""Return a deferred for the HTTP download"""
factory = create_factory(request, spider)
url = urlparse_cached(request)
port = url.port
reactor.connectTCP(url.hostname, port or 80, factory)
return factory.deferred

def download_https(request, spider):
"""Return a deferred for the HTTPS download"""
factory = create_factory(request, spider)
url = urlparse_cached(request)
port = url.port
contextFactory = ssl.ClientContextFactory()
reactor.connectSSL(url.hostname, port or 443, factory, contextFactory)
return factory.deferred

def download_file(request, spider) :
"""Return a deferred for a file download."""
filepath = request.url.split("file://")[1]
with open(filepath) as f:
body = f.read()
respcls = responsetypes.from_args(filename=filepath, body=body)
response = respcls(url=request.url, body=body)

return defer_succeed(response)
27
scrapy/core/downloader/handlers/__init__.py
Normal file
@ -0,0 +1,27 @@
"""Download handlers for different schemes"""

from scrapy.core.exceptions import NotSupported
from scrapy.utils.httpobj import urlparse_cached
from scrapy.conf import settings
from scrapy.utils.misc import load_object


class RequestHandlers(object):

def __init__(self):
self._handlers = {}
handlers = settings.get('REQUEST_HANDLERS_BASE')
handlers.update(settings.get('REQUEST_HANDLERS', {}))
for scheme, cls in handlers.iteritems():
self._handlers[scheme] = load_object(cls)

def download_request(self, request, spider):
scheme = urlparse_cached(request).scheme
try:
handler = self._handlers[scheme]
except KeyError:
raise NotSupported("Unsupported URL scheme '%s' in: <%s>" % (scheme, request.url))
return handler(request, spider)


download_any = RequestHandlers().download_request
18
scrapy/core/downloader/handlers/file.py
Normal file
@ -0,0 +1,18 @@
"""Download handler for file:// scheme"""
from __future__ import with_statement

from twisted.internet import defer
from scrapy.core.downloader.responsetypes import responsetypes


def download_file(request, spider):
"""Return a deferred for a file download."""
return defer.maybeDeferred(_all_in_one_read_download_file, request, spider)

def _all_in_one_read_download_file(request, spider):
filepath = request.url.split("file://")[1]
with open(filepath) as f:
body = f.read()
respcls = responsetypes.from_args(filename=filepath, body=body)
return respcls(url=request.url, body=body)
50
scrapy/core/downloader/handlers/http.py
Normal file
@ -0,0 +1,50 @@
"""Download handlers for http and https schemes"""

from twisted.internet import reactor

from scrapy.core import signals
from scrapy.core.exceptions import NotSupported
from scrapy.utils.signal import send_catch_log
from scrapy.utils.misc import load_object
from scrapy.conf import settings
from scrapy import optional_features

ssl_supported = 'ssl' in optional_features
if ssl_supported:
from twisted.internet.ssl import ClientContextFactory


HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
default_timeout = settings.getint('DOWNLOAD_TIMEOUT')

def _create_factory(request, spider):
def _download_signals(response):
send_catch_log(signal=signals.request_uploaded, \
sender='download_http', request=request, spider=spider)
send_catch_log(signal=signals.response_downloaded, \
sender='download_http', response=response, spider=spider)
return response

timeout = getattr(spider, "download_timeout", None) or default_timeout
factory = HTTPClientFactory(request, timeout)
factory.deferred.addCallbacks(_download_signals)
return factory


def _connect(factory):
host, port = factory.host, factory.port
if factory.scheme == 'https':
if ssl_supported:
return reactor.connectSSL(host, port, factory, ClientContextFactory())
raise NotSupported("HTTPS not supported: install pyopenssl library")
else:
return reactor.connectTCP(host, port, factory)


def download_http(request, spider):
"""Return a deferred for the HTTP download"""
factory = _create_factory(request, spider)
_connect(factory)
return factory.deferred

@ -27,7 +27,7 @@ class SiteInfo(object):
if download_delay:
self.max_concurrent_requests = 1
elif max_concurrent_requests is None:
self.max_concurrent_requests = settings.getint('REQUESTS_PER_DOMAIN')
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
else:
self.max_concurrent_requests = max_concurrent_requests

@ -54,7 +54,7 @@ class Downloader(object):
def __init__(self):
self.sites = {}
self.middleware = DownloaderMiddlewareManager()
self.concurrent_domains = settings.getint('CONCURRENT_DOMAINS')
self.concurrent_domains = settings.getint('CONCURRENT_SPIDERS')

def fetch(self, request, spider):
"""Main method to use to request a download
@ -135,7 +135,7 @@ class Downloader(object):
# downloader middleware, to speed-up the closing process
if site.closing:
log.msg("Crawled while closing spider: %s" % request, \
level=log.DEBUG)
level=log.DEBUG, spider=spider)
raise IgnoreRequest
return _
return dfd.addBoth(finish_transferring)
@ -1,12 +1,14 @@
from urlparse import urlparse, urlunparse
from urlparse import urlparse, urlunparse, urldefrag

from twisted.python import failure
from twisted.web.client import HTTPClientFactory, PartialDownloadError
from twisted.web.client import PartialDownloadError, HTTPClientFactory
from twisted.web.http import HTTPClient
from twisted.internet import defer

from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.core.downloader.responsetypes import responsetypes


def _parsed_url_args(parsed):
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
@ -18,11 +20,13 @@ def _parsed_url_args(parsed):
port = 443 if scheme == 'https' else 80
return scheme, netloc, host, port, path


def _parse(url):
url = url.strip()
parsed = urlparse(url)
return _parsed_url_args(parsed)


class ScrapyHTTPPageGetter(HTTPClient):

def connectionMade(self):
@ -64,7 +68,9 @@ class ScrapyHTTPPageGetter(HTTPClient):

def timeout(self):
self.transport.loseConnection()
self.factory.noPage(defer.TimeoutError("Getting %s took longer than %s seconds." % (self.factory.url, self.factory.timeout)))
self.factory.noPage(\
defer.TimeoutError("Getting %s took longer than %s seconds." % \
(self.factory.url, self.factory.timeout)))


class ScrapyHTTPClientFactory(HTTPClientFactory):
@ -74,22 +80,19 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
"""

protocol = ScrapyHTTPPageGetter
response_headers = None
waiting = 1
noisy = False

def __init__(self, url, method='GET', body=None, headers=None, timeout=0, parsedurl=None):
self.url = url
self.method = method
self.body = body or None
if parsedurl:
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsedurl)
else:
self.scheme, self.netloc, self.host, self.port, self.path = _parse(url)

def __init__(self, request, timeout=0):
self.url = urldefrag(request.url)[0]
self.method = request.method
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = timeout
self.headers = Headers(headers or {})
self.deferred = defer.Deferred()
self.deferred = defer.Deferred().addCallback(self._build_response)

self._set_connection_attributes(request)

# set Host header based on url
self.headers.setdefault('Host', self.netloc)
@ -100,33 +103,19 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
# just in case a broken http/1.1 decides to keep connection alive
self.headers.setdefault("Connection", "close")

@classmethod
def from_request(cls, request, timeout):
return cls(request.url,
method=request.method,
body=request.body or None, # see http://dev.scrapy.org/ticket/60
headers=Headers(request.headers or {}),
timeout=timeout,
parsedurl=urlparse_cached(request),
)
def _build_response(self, body):
status = int(self.status)
headers = Headers(self.response_headers)
respcls = responsetypes.from_args(headers=headers, url=self.url)
return respcls(url=self.url, status=status, headers=headers, body=body)

def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
proxy = request.meta.get('proxy')
if proxy:
self.scheme, _, self.host, self.port, _ = _parse(proxy)
self.path = self.url

def gotHeaders(self, headers):
self.response_headers = headers



def getPage(url, contextFactory=None, *args, **kwargs):
"""
Download a web page as a string.

Download a page. Return a deferred, which will callback with a
page (as a string) or errback with a description of the error.

See HTTPClientFactory to see what extra args can be passed.
"""
from twisted.web.client import _makeGetterFactory
return _makeGetterFactory(
url,
ScrapyHTTPClientFactory,
contextFactory=contextFactory,
*args, **kwargs).deferred
@ -34,6 +34,7 @@ class ExecutionEngine(object):
self.paused = False
self._next_request_pending = set()
self._mainloop_task = task.LoopingCall(self._mainloop)
self._crawled_logline = load_object(settings['LOG_FORMATTER_CRAWLED'])

def configure(self):
"""
@ -98,7 +99,7 @@ class ExecutionEngine(object):
return True

def next_request(self, spider, now=False):
"""Scrape the next request for the domain passed.
"""Scrape the next request for the spider passed.

The next request to be scraped is retrieved from the scheduler and
requested from the downloader.
@ -163,7 +164,7 @@ class ExecutionEngine(object):
def crawl(self, request, spider):
if not request.deferred.callbacks:
log.msg("Unable to crawl Request with no callback: %s" % request,
level=log.ERROR, domain=spider.domain_name)
level=log.ERROR, spider=spider)
return
schd = mustbe_deferred(self.schedule, request, spider)
# FIXME: we can't log errors because we would be preventing them from
@ -185,7 +186,7 @@ class ExecutionEngine(object):
return self.scheduler.enqueue_request(spider, request)

def _mainloop(self):
"""Add more domains to be scraped if the downloader has the capacity.
"""Add more spiders to be scraped if the downloader has the capacity.

If there is nothing else scheduled then stop the execution engine.
"""
@ -197,16 +198,13 @@ class ExecutionEngine(object):
return self._stop_if_idle()

def download(self, request, spider):
domain = spider.domain_name
referer = request.headers.get('Referer')

def _on_success(response):
"""handle the result of a page download"""
assert isinstance(response, (Response, Request))
if isinstance(response, Response):
response.request = request # tie request to response received
log.msg("Crawled %s (referer: <%s>)" % (request, referer), \
level=log.DEBUG, domain=domain)
log.msg(self._crawled_logline(request, response), \
level=log.DEBUG, spider=spider)
return response
elif isinstance(response, Request):
newrequest = response
@ -224,52 +222,48 @@ class ExecutionEngine(object):
errmsg = str(_failure)
level = log.ERROR
if errmsg:
log.msg("Downloading <%s> (referer: <%s>): %s" % (request.url, \
referer, errmsg), level=level, domain=domain)
log.msg("Crawling <%s>: %s" % (request.url, errmsg), \
level=level, spider=spider)
return Failure(IgnoreRequest(str(exc)))

def _on_complete(_):
self.next_request(spider)
return _

if spider not in self.downloader.sites:
return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)

dwld = mustbe_deferred(self.downloader.fetch, request, spider)
dwld.addCallbacks(_on_success, _on_error)
dwld.addBoth(_on_complete)
return dwld

def open_spider(self, spider):
domain = spider.domain_name
log.msg("Domain opened", domain=domain)
log.msg("Spider opened", spider=spider)
self.next_request(spider)

self.downloader.open_spider(spider)
self.scraper.open_spider(spider)
stats.open_domain(domain)
stats.open_spider(spider)

# XXX: sent for backwards compatibility (will be removed in Scrapy 0.8)
send_catch_log(signals.domain_open, sender=self.__class__, \
domain=domain, spider=spider)

send_catch_log(signals.domain_opened, sender=self.__class__, \
domain=domain, spider=spider)
send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider)

def _spider_idle(self, spider):
"""Called when a domain gets idle. This function is called when there
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseDomain exception
(in the domain_idle signal handler) the domain is not closed until the
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this domain.
again for this spider.
"""
domain = spider.domain_name
try:
dispatcher.send(signal=signals.domain_idle, sender=self.__class__, \
domain=domain, spider=spider)
dispatcher.send(signal=signals.spider_idle, sender=self.__class__, \
spider=spider)
except DontCloseDomain:
self.next_request(spider)
reactor.callLater(5, self.next_request, spider)
return
except:
log.err("Exception catched on domain_idle signal dispatch")
log.err("Exception catched on spider_idle signal dispatch")
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')

@ -280,9 +274,8 @@ class ExecutionEngine(object):

def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
domain = spider.domain_name
if spider not in self.closing:
log.msg("Closing domain (%s)" % reason, domain=domain)
log.msg("Closing spider (%s)" % reason, spider=spider)
self.closing[spider] = reason
self.downloader.close_spider(spider)
self.scheduler.clear_pending_requests(spider)
@ -295,7 +288,7 @@ class ExecutionEngine(object):
return dlist

def _finish_closing_spider_if_idle(self, spider):
"""Call _finish_closing_spider if domain is idle"""
"""Call _finish_closing_spider if spider is idle"""
if self.spider_is_idle(spider) or self.killed:
return self._finish_closing_spider(spider)
else:
@ -307,15 +300,14 @@ class ExecutionEngine(object):

def _finish_closing_spider(self, spider):
"""This function is called after the spider has been closed"""
domain = spider.domain_name
self.scheduler.close_spider(spider)
self.scraper.close_spider(spider)
reason = self.closing.pop(spider, 'finished')
send_catch_log(signal=signals.domain_closed, sender=self.__class__, \
domain=domain, spider=spider, reason=reason)
stats.close_domain(domain, reason=reason)
send_catch_log(signal=signals.spider_closed, sender=self.__class__, \
spider=spider, reason=reason)
stats.close_spider(spider, reason=reason)
dfd = defer.maybeDeferred(spiders.close_spider, spider)
dfd.addBoth(log.msg, "Domain closed (%s)" % reason, domain=domain)
dfd.addBoth(log.msg, "Spider closed (%s)" % reason, spider=spider)
reactor.callLater(0, self._mainloop)
return dfd

@ -87,17 +87,17 @@ class Scraper(object):
def enqueue_scrape(self, response, request, spider):
site = self.sites[spider]
dfd = site.add_response_request(response, request)
# FIXME: this can't be called here because the stats domain may be
# FIXME: this can't be called here because the stats spider may be
# already closed
#stats.max_value('scraper/max_active_size', site.active_size, \
# domain=spider.domain_name)
# spider=spider)
def finish_scraping(_):
site.finish_response(response)
self._scrape_next(spider, site)
return _
dfd.addBoth(finish_scraping)
dfd.addErrback(log.err, 'Scraper bug processing %s' % request, \
domain=spider.domain_name)
spider=spider)
self._scrape_next(spider, site)
return dfd

@ -136,9 +136,9 @@ class Scraper(object):
referer = request.headers.get('Referer', None)
msg = "Spider exception caught while processing <%s> (referer: <%s>): %s" % \
(request.url, referer, _failure)
log.msg(msg, log.ERROR, domain=spider.domain_name)
log.msg(msg, log.ERROR, spider=spider)
stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
domain=spider.domain_name)
spider=spider)

def handle_spider_output(self, result, request, response, spider):
if not result:
@ -152,7 +152,6 @@ class Scraper(object):
from the given spider
"""
# TODO: keep closing state internally instead of checking engine
domain = spider.domain_name
if spider in self.engine.closing:
return
elif isinstance(output, Request):
@ -161,14 +160,14 @@ class Scraper(object):
self.engine.crawl(request=output, spider=spider)
elif isinstance(output, BaseItem):
log.msg("Scraped %s in <%s>" % (output, request.url), level=log.DEBUG, \
domain=domain)
spider=spider)
send_catch_log(signal=signals.item_scraped, sender=self.__class__, \
item=output, spider=spider, response=response)
self.sites[spider].itemproc_size += 1
# FIXME: this can't be called here because the stats domain may be
# FIXME: this can't be called here because the stats spider may be
# already closed
#stats.max_value('scraper/max_itemproc_size', \
# self.sites[domain].itemproc_size, domain=domain)
# self.sites[spider].itemproc_size, spider=spider)
dfd = self.itemproc.process_item(output, spider)
dfd.addBoth(self._itemproc_finished, output, spider)
return dfd
@ -176,7 +175,7 @@ class Scraper(object):
pass
else:
log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
(type(output).__name__, request), log.ERROR, domain=domain)
(type(output).__name__, request), log.ERROR, spider=spider)

def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
"""Log and silence the bugs raised outside of spiders, but still allow
@ -195,19 +194,18 @@ class Scraper(object):
def _itemproc_finished(self, output, item, spider):
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
domain = spider.domain_name
self.sites[spider].itemproc_size -= 1
if isinstance(output, Failure):
ex = output.value
if isinstance(ex, DropItem):
log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING, domain=domain)
log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING, spider=spider)
send_catch_log(signal=signals.item_dropped, sender=self.__class__, \
item=item, spider=spider, exception=output.value)
else:
log.msg('Error processing %s - %s' % (item, output), \
log.ERROR, domain=domain)
log.ERROR, spider=spider)
else:
log.msg("Passed %s" % item, log.INFO, domain=domain)
log.msg("Passed %s" % item, log.INFO, spider=spider)
send_catch_log(signal=signals.item_passed, sender=self.__class__, \
item=item, spider=spider, output=output)

@ -7,9 +7,9 @@ signals here without documenting them there.

engine_started = object()
engine_stopped = object()
domain_opened = object()
domain_idle = object()
domain_closed = object()
spider_opened = object()
spider_idle = object()
spider_closed = object()
request_received = object()
request_uploaded = object()
response_received = object()
@ -17,7 +17,3 @@ response_downloaded = object()
item_scraped = object()
item_passed = object()
item_dropped = object()

# XXX: deprecated signals (will be removed in Scrapy 0.8)
domain_open = object()

@ -85,9 +85,6 @@ class Request(object_ref):
return self._encoding

def __str__(self):
if self.method == 'GET':
return "<%s>" % self.url
else:
return "<%s %s>" % (self.method, self.url)

def __repr__(self):
@ -37,7 +37,8 @@ class FormRequest(Request):
self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

@classmethod
def from_response(cls, response, formnumber=0, formdata=None, **kwargs):
def from_response(cls, response, formnumber=0, formdata=None,
clickdata=None, dont_click=False, **kwargs):
encoding = getattr(response, 'encoding', 'utf-8')
forms = ParseFile(StringIO(response.body), response.url,
encoding=encoding, backwards_compat=False)
@ -51,11 +52,14 @@ class FormRequest(Request):
# remove all existing fields with the same name before, so that
# formdata fields properly can properly override existing ones,
# which is the desired behaviour
form.controls = [c for c in form.controls if c.name not in formdata.keys()]
form.controls = [c for c in form.controls if c.name not in formdata]
for k, v in formdata.iteritems():
for v2 in v if hasattr(v, '__iter__') else [v]:
form.new_control('text', k, {'value': v2})

url, body, headers = form.click_request_data()
request = cls(url, method=form.method, body=body, headers=headers, **kwargs)
return request
if dont_click:
url, body, headers = form._switch_click('request_data')
else:
url, body, headers = form.click_request_data(**(clickdata or {}))

return cls(url, method=form.method, body=body, headers=headers, **kwargs)
@ -65,9 +65,7 @@ class Response(object_ref):
return "%s(%s)" % (self.__class__.__name__, args)

def __str__(self):
flags = "(%s) " % ",".join(self.flags) if self.flags else ""
status = "%d " % self.status + " " if self.status != 200 else ""
return "%s<%s%s>" % (flags, status, self.url)
return "<%d %s>" % (self.status, self.url)

def copy(self):
"""Return a copy of this Response"""
@ -11,9 +11,11 @@ from scrapy.xlib.BeautifulSoup import UnicodeDammit

from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.conf import settings

class TextResponse(Response):

_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)

__slots__ = ['_encoding', '_body_inferred_encoding']
@ -71,6 +73,8 @@ class TextResponse(Response):
self._body_declared_encoding())
dammit = UnicodeDammit(self.body, possible_encodings)
self._body_inferred_encoding = dammit.originalEncoding
if self._body_inferred_encoding in ('ascii', None):
self._body_inferred_encoding = self._DEFAULT_ENCODING
return dammit.unicode

def body_encoding(self):
@ -83,29 +83,3 @@ class Item(DictItem):

__metaclass__ = ItemMeta


class ScrapedItem(BaseItem):

def __init__(self, data=None):
"""
A ScrapedItem can be initialised with a dictionary that will be
squirted directly into the object.
"""
import warnings
warnings.warn("scrapy.item.ScrapedItem is deprecated, use scrapy.item.Item instead",
DeprecationWarning, stacklevel=2)
if isinstance(data, dict):
for attr, value in data.iteritems():
setattr(self, attr, value)
elif data is not None:
raise TypeError("Initialize with dict, not %s" % data.__class__.__name__)

def __repr__(self):
"""
Generate the following format so that items can be deserialized
easily: ClassName({'attrib': value, ...})
"""
reprdict = dict(items for items in self.__dict__.iteritems() \
if not items[0].startswith('_'))
return "%s(%s)" % (self.__class__.__name__, repr(reprdict))

@ -26,7 +26,7 @@ level_names = {
BOT_NAME = settings['BOT_NAME']

# signal sent when log message is received
# args: message, level, domain
# args: message, level, spider
logmessage_received = object()

# default logging level
@ -63,26 +63,31 @@ def start(logfile=None, loglevel=None, logstdout=None):
file = open(logfile, 'a') if logfile else sys.stderr
log.startLogging(file, setStdout=logstdout)

def msg(message, level=INFO, component=BOT_NAME, domain=None):
def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
"""Log message according to the level"""
if level > log_level:
return
if domain is not None:
import warnings
warnings.warn("'domain' argument of scrapy.log.msg() is deprecated, " \
"use 'spider' argument instead", DeprecationWarning, stacklevel=2)
dispatcher.send(signal=logmessage_received, message=message, level=level, \
domain=domain)
system = domain if domain else component
spider=spider)
system = domain or (spider.domain_name if spider else component)
msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
log.msg(msg_txt, system=system)

def exc(message, level=ERROR, component=BOT_NAME, domain=None):
def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None):
message = message + '\n' + format_exc()
msg(message, level, component, domain)
msg(message, level, component, domain, spider)

def err(_stuff=None, _why=None, **kwargs):
if ERROR > log_level:
return
domain = kwargs.pop('domain', None)
spider = kwargs.pop('spider', None)
component = kwargs.pop('component', BOT_NAME)
kwargs['system'] = domain if domain else component
kwargs['system'] = domain or spider.domain_name if spider else component
if _why:
_why = unicode_to_str("ERROR: %s" % _why)
log.err(_stuff, _why, **kwargs)
@ -5,9 +5,10 @@ garbage collection to libxml2 documents (xmlDoc).

import weakref

from scrapy.utils.trackref import object_ref
from .factories import xmlDoc_from_html

class Libxml2Document(object):
class Libxml2Document(object_ref):

cache = weakref.WeakKeyDictionary()
__slots__ = ['xmlDoc', 'xpathContext', '__weakref__']
@ -15,7 +16,7 @@ class Libxml2Document(object):
def __new__(cls, response, factory=xmlDoc_from_html):
cache = cls.cache.setdefault(response, {})
if factory not in cache:
obj = object.__new__(cls)
obj = object_ref.__new__(cls)
obj.xmlDoc = factory(response)
obj.xpathContext = obj.xmlDoc.xpathNewContext()
cache[factory] = obj
@ -25,8 +26,7 @@ class Libxml2Document(object):
# we must call both cleanup functions, so we try/except all exceptions
# to make sure one doesn't prevent the other from being called
# this call sometimes raises a "NoneType is not callable" TypeError
# also, these calls sometimes raise a "NoneType is not callable"
# TypeError, so the try/except block silences them
# so the try/except block silences them
try:
self.xmlDoc.freeDoc()
except:
@ -24,7 +24,7 @@ def body_as_utf8(response):

def xmlDoc_from_html(response):
"""Return libxml2 doc for HTMLs"""
utf8body = body_as_utf8(response)
utf8body = body_as_utf8(response) or ' '
try:
lxdoc = libxml2.htmlReadDoc(utf8body, response.url, 'utf-8', \
html_parser_options)
@ -35,7 +35,7 @@ def xmlDoc_from_html(response):

def xmlDoc_from_xml(response):
"""Return libxml2 doc for XMLs"""
utf8body = body_as_utf8(response)
utf8body = body_as_utf8(response) or ' '
try:
lxdoc = libxml2.readDoc(utf8body, response.url, 'utf-8', \
xml_parser_options)
@ -18,7 +18,7 @@ from scrapy.utils.response import open_in_browser
from scrapy.conf import settings
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.http import Request
from scrapy.http import Request, TextResponse

def relevant_var(varname):
return varname not in ['shelp', 'fetch', 'view', '__builtins__', 'In', \
@ -67,6 +67,7 @@ class Shell(object):
item = self.item_class()
self.vars['item'] = item
if url:
if isinstance(response, TextResponse):
self.vars['xxs'] = XmlXPathSelector(response)
self.vars['hxs'] = HtmlXPathSelector(response)
self.vars['url'] = url
@ -66,7 +66,7 @@ class BaseSpider(object):
"""Log the given messages at the given log level. Always use this
method to send log messages from your spider
"""
log.msg(message, domain=self.domain_name, level=level)
log.msg(message, spider=self, level=level)

def start_requests(self):
reqs = []
@ -5,8 +5,8 @@ import pprint

from scrapy.xlib.pydispatch import dispatcher

from scrapy.stats.signals import stats_domain_opened, stats_domain_closing, \
stats_domain_closed
from scrapy.stats.signals import stats_spider_opened, stats_spider_closing, \
stats_spider_closed
from scrapy.utils.signal import send_catch_log
from scrapy.core import signals
from scrapy import log
@ -19,57 +19,57 @@ class StatsCollector(object):
self._stats = {None: {}} # None is for global stats
dispatcher.connect(self._engine_stopped, signal=signals.engine_stopped)

def get_value(self, key, default=None, domain=None):
return self._stats[domain].get(key, default)
def get_value(self, key, default=None, spider=None):
return self._stats[spider].get(key, default)

def get_stats(self, domain=None):
return self._stats[domain]
def get_stats(self, spider=None):
return self._stats[spider]

def set_value(self, key, value, domain=None):
self._stats[domain][key] = value
def set_value(self, key, value, spider=None):
self._stats[spider][key] = value

def set_stats(self, stats, domain=None):
self._stats[domain] = stats
def set_stats(self, stats, spider=None):
self._stats[spider] = stats

def inc_value(self, key, count=1, start=0, domain=None):
d = self._stats[domain]
def inc_value(self, key, count=1, start=0, spider=None):
d = self._stats[spider]
d[key] = d.setdefault(key, start) + count

def max_value(self, key, value, domain=None):
d = self._stats[domain]
def max_value(self, key, value, spider=None):
d = self._stats[spider]
d[key] = max(d.setdefault(key, value), value)

def min_value(self, key, value, domain=None):
d = self._stats[domain]
def min_value(self, key, value, spider=None):
d = self._stats[spider]
d[key] = min(d.setdefault(key, value), value)

def clear_stats(self, domain=None):
self._stats[domain].clear()
def clear_stats(self, spider=None):
self._stats[spider].clear()

def list_domains(self):
return [d for d in self._stats.keys() if d is not None]
def iter_spider_stats(self):
return [x for x in self._stats.iteritems() if x[0]]

def open_domain(self, domain):
self._stats[domain] = {}
send_catch_log(stats_domain_opened, domain=domain)
def open_spider(self, spider):
self._stats[spider] = {}
send_catch_log(stats_spider_opened, spider=spider)

def close_domain(self, domain, reason):
send_catch_log(stats_domain_closing, domain=domain, reason=reason)
stats = self._stats.pop(domain)
send_catch_log(stats_domain_closed, domain=domain, reason=reason, \
domain_stats=stats)
def close_spider(self, spider, reason):
send_catch_log(stats_spider_closing, spider=spider, reason=reason)
stats = self._stats.pop(spider)
send_catch_log(stats_spider_closed, spider=spider, reason=reason, \
spider_stats=stats)
if self._dump:
log.msg("Dumping domain stats:\n" + pprint.pformat(stats), \
domain=domain)
self._persist_stats(stats, domain)
log.msg("Dumping spider stats:\n" + pprint.pformat(stats), \
spider=spider)
self._persist_stats(stats, spider)

def _engine_stopped(self):
stats = self.get_stats()
if self._dump:
log.msg("Dumping global stats:\n" + pprint.pformat(stats))
self._persist_stats(stats, domain=None)
self._persist_stats(stats, spider=None)

def _persist_stats(self, stats, domain=None):
def _persist_stats(self, stats, spider=None):
pass

class MemoryStatsCollector(StatsCollector):
@ -78,28 +78,29 @@ class MemoryStatsCollector(StatsCollector):
super(MemoryStatsCollector, self).__init__()
self.domain_stats = {}

def _persist_stats(self, stats, domain=None):
self.domain_stats[domain] = stats
def _persist_stats(self, stats, spider=None):
if spider is not None:
self.domain_stats[spider.domain_name] = stats


class DummyStatsCollector(StatsCollector):

def get_value(self, key, default=None, domain=None):
def get_value(self, key, default=None, spider=None):
return default

def set_value(self, key, value, domain=None):
def set_value(self, key, value, spider=None):
pass

def set_stats(self, stats, domain=None):
def set_stats(self, stats, spider=None):
pass

def inc_value(self, key, count=1, start=0, domain=None):
def inc_value(self, key, count=1, start=0, spider=None):
pass

def max_value(self, key, value, domain=None):
def max_value(self, key, value, spider=None):
pass

def min_value(self, key, value, domain=None):
def min_value(self, key, value, spider=None):
pass


@ -16,8 +16,8 @@ class MysqlStatsCollector(StatsCollector):
mysqluri = settings['STATS_MYSQL_URI']
self._mysql_conn = mysql_connect(mysqluri, use_unicode=False) if mysqluri else None

def _persist_stats(self, stats, domain=None):
if domain is None: # only store domain-specific stats
def _persist_stats(self, stats, spider=None):
if spider is None: # only store spider-specific stats
return
if self._mysql_conn is None:
return
@ -27,5 +27,5 @@ class MysqlStatsCollector(StatsCollector):

c = self._mysql_conn.cursor()
c.execute("INSERT INTO %s (domain,stored,data) VALUES (%%s,%%s,%%s)" % table, \
(domain, stored, datas))
(spider.domain_name, stored, datas))
self._mysql_conn.commit()
@ -22,27 +22,27 @@ class SimpledbStatsCollector(StatsCollector):
self._async = settings.getbool('STATS_SDB_ASYNC')
connect_sdb().create_domain(self._sdbdomain)

def _persist_stats(self, stats, domain=None):
if domain is None: # only store domain-specific stats
def _persist_stats(self, stats, spider=None):
if spider is None: # only store spider-specific stats
return
if not self._sdbdomain:
return
if self._async:
dfd = threads.deferToThread(self._persist_to_sdb, domain, stats.copy())
dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \
domain=domain)
spider=spider)
else:
self._persist_to_sdb(domain, stats)
self._persist_to_sdb(spider, stats)

def _persist_to_sdb(self, domain, stats):
ts = self._get_timestamp(domain).isoformat()
sdb_item_id = "%s_%s" % (domain, ts)
def _persist_to_sdb(self, spider, stats):
ts = self._get_timestamp(spider).isoformat()
sdb_item_id = "%s_%s" % (spider.domain_name, ts)
sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
sdb_item['domain'] = domain
sdb_item['domain'] = spider.domain_name
sdb_item['timestamp'] = self._to_sdb_value(ts)
connect_sdb().put_attributes(self._sdbdomain, sdb_item_id, sdb_item)

def _get_timestamp(self, domain):
def _get_timestamp(self, spider):
return datetime.utcnow()

def _to_sdb_value(self, obj, key=None):
@ -1,3 +1,3 @@
stats_domain_opened = object()
stats_domain_closing = object()
stats_domain_closed = object()
stats_spider_opened = object()
stats_spider_closing = object()
stats_spider_closed = object()
67
scrapy/tests/test_contrib_exp_loader_lxmlloader.py
Normal file
@ -0,0 +1,67 @@
from twisted.trial import unittest

from scrapy.contrib.loader.processor import MapCompose
from scrapy.item import Item, Field
from scrapy.http import HtmlResponse

try:
import lxml
except ImportError:
lxml = False


class TestItem(Item):
name = Field()


if lxml:
from scrapy.contrib_exp.loader.lxmlloader import LxmlItemLoader

class TestLxmlItemLoader(LxmlItemLoader):
default_item_class = TestItem


class LxmlItemLoaderTest(unittest.TestCase):
response = HtmlResponse(url="", body='<html><body><div id="id">marta</div><p>paragraph</p></body></html>')

def setUp(self):
if not lxml:
raise unittest.SkipTest("lxml is not available")

def test_constructor_with_response(self):
l = TestLxmlItemLoader(response=self.response)
self.assert_(l.tree)

def test_add_xpath(self):
l = TestLxmlItemLoader(response=self.response)
l.add_xpath('name', '//div')
self.assertEqual(l.get_output_value('name'), [u'<div id="id">marta</div>'])

def test_add_xpath_text(self):
l = TestLxmlItemLoader(response=self.response)
l.add_xpath('name', '//div/text()')
self.assertEqual(l.get_output_value('name'), [u'marta'])

def test_replace_xpath(self):
l = TestLxmlItemLoader(response=self.response)
l.add_xpath('name', '//div/text()')
self.assertEqual(l.get_output_value('name'), [u'marta'])
l.replace_xpath('name', '//p/text()')
self.assertEqual(l.get_output_value('name'), [u'paragraph'])

def test_add_css(self):
l = TestLxmlItemLoader(response=self.response)
l.add_css('name', '#id')
self.assertEqual(l.get_output_value('name'), [u'<div id="id">marta</div>'])

def test_replace_css(self):
l = TestLxmlItemLoader(response=self.response)
l.add_css('name', '#id')
self.assertEqual(l.get_output_value('name'), [u'<div id="id">marta</div>'])
l.replace_css('name', 'p')
self.assertEqual(l.get_output_value('name'), [u'<p>paragraph</p>'])


if __name__ == "__main__":
unittest.main()
22
scrapy/tests/test_contrib_logformatter.py
Normal file
@ -0,0 +1,22 @@
import unittest

from scrapy.http import Request, Response
from scrapy.contrib.logformatter import crawled_logline


class LoggingContribTest(unittest.TestCase):

def test_crawled_logline(self):
req = Request("http://www.example.com")
res = Response("http://www.example.com")
self.assertEqual(crawled_logline(req, res),
"Crawled (200) <GET http://www.example.com> (referer: None)")

req = Request("http://www.example.com", headers={'referer': 'http://example.com'})
res = Response("http://www.example.com", flags=['cached'])
self.assertEqual(crawled_logline(req, res),
"Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")


if __name__ == "__main__":
unittest.main()
163
scrapy/tests/test_downloader_handlers.py
Normal file
@ -0,0 +1,163 @@
import os

from twisted.trial import unittest
from twisted.protocols.policies import WrappingFactory
from twisted.python.filepath import FilePath
from twisted.internet import reactor, defer
from twisted.web import server, static, util, resource
from twisted.web.test.test_webclient import ForeverTakingResource, \
NoLengthResource, HostHeaderResource, \
PayloadResource, BrokenDownloadResource

from scrapy.core.downloader.webclient import PartialDownloadError
from scrapy.core.downloader.handlers.file import download_file
from scrapy.core.downloader.handlers.http import download_http
from scrapy.spider import BaseSpider
from scrapy.http import Request


class FileTestCase(unittest.TestCase):

def setUp(self):
self.tmpname = self.mktemp()
fd = open(self.tmpname, 'w')
fd.write('0123456789')
fd.close()

def test_download(self):
def _test(response):
self.assertEquals(response.url, request.url)
self.assertEquals(response.status, 200)
self.assertEquals(response.body, '0123456789')

request = Request('file://%s' % self.tmpname)
return download_file(request, BaseSpider()).addCallback(_test)

def test_non_existent(self):
request = Request('file://%s' % self.mktemp())
d = download_file(request, BaseSpider())
return self.assertFailure(d, IOError)


class HttpTestCase(unittest.TestCase):

def setUp(self):
name = self.mktemp()
os.mkdir(name)
FilePath(name).child("file").setContent("0123456789")
r = static.File(name)
r.putChild("redirect", util.Redirect("/file"))
r.putChild("wait", ForeverTakingResource())
r.putChild("nolength", NoLengthResource())
r.putChild("host", HostHeaderResource())
r.putChild("payload", PayloadResource())
r.putChild("broken", BrokenDownloadResource())
self.site = server.Site(r, timeout=None)
self.wrapper = WrappingFactory(self.site)
self.port = reactor.listenTCP(0, self.wrapper, interface='127.0.0.1')
self.portno = self.port.getHost().port

def tearDown(self):
return self.port.stopListening()

def getURL(self, path):
return "http://127.0.0.1:%d/%s" % (self.portno, path)

def test_download(self):
request = Request(self.getURL('file'))
d = download_http(request, BaseSpider())
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
return d

def test_redirect_status(self):
request = Request(self.getURL('redirect'))
d = download_http(request, BaseSpider())
d.addCallback(lambda r: r.status)
d.addCallback(self.assertEquals, 302)
return d

def test_timeout_download_from_spider(self):
spider = BaseSpider()
spider.download_timeout = 0.000001
request = Request(self.getURL('wait'))
d = download_http(request, spider)
return self.assertFailure(d, defer.TimeoutError)

def test_host_header_not_in_request_headers(self):
def _test(response):
self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
self.assertEquals(request.headers, {})

request = Request(self.getURL('host'))
return download_http(request, BaseSpider()).addCallback(_test)

def test_host_header_seted_in_request_headers(self):
def _test(response):
self.assertEquals(response.body, 'example.com')
self.assertEquals(request.headers.get('Host'), 'example.com')

request = Request(self.getURL('host'), headers={'Host': 'example.com'})
return download_http(request, BaseSpider()).addCallback(_test)

d = download_http(request, BaseSpider())
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, 'example.com')
return d

def test_payload(self):
body = '1'*100 # PayloadResource requires body length to be 100
request = Request(self.getURL('payload'), method='POST', body=body)
d = download_http(request, BaseSpider())
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, body)
return d

def test_broken_download(self):
request = Request(self.getURL('broken'))
d = download_http(request, BaseSpider())
return self.assertFailure(d, PartialDownloadError)


class UriResource(resource.Resource):
"""Return the full uri that was requested"""

def getChild(self, path, request):
return self

def render(self, request):
return request.uri


class HttpProxyTestCase(unittest.TestCase):

def setUp(self):
site = server.Site(UriResource(), timeout=None)
wrapper = WrappingFactory(site)
self.port = reactor.listenTCP(0, wrapper, interface='127.0.0.1')
self.portno = self.port.getHost().port

def tearDown(self):
return self.port.stopListening()

def getURL(self, path):
return "http://127.0.0.1:%d/%s" % (self.portno, path)

def test_download_with_proxy(self):
def _test(response):
self.assertEquals(response.status, 200)
self.assertEquals(response.url, request.url)
self.assertEquals(response.body, 'https://example.com')

http_proxy = self.getURL('')
request = Request('https://example.com', meta={'proxy': http_proxy})
return download_http(request, BaseSpider()).addCallback(_test)

def test_download_without_proxy(self):
def _test(response):
self.assertEquals(response.status, 200)
self.assertEquals(response.url, request.url)
self.assertEquals(response.body, '/path/to/resource')

request = Request(self.getURL('path/to/resource'))
return download_http(request, BaseSpider()).addCallback(_test)
@ -14,7 +14,7 @@ class CookiesMiddlewareTest(TestCase):
self.mw = CookiesMiddleware()

def tearDown(self):
self.mw.domain_closed('scrapytest.org')
self.mw.spider_closed(self.spider)
del self.mw

def test_basic(self):
@ -1,10 +1,11 @@
from unittest import TestCase, main
from scrapy.http import Response, XmlResponse
from scrapy.http import Response, XmlResponse, Request
from scrapy.contrib_exp.downloadermiddleware.decompression import DecompressionMiddleware
from scrapy.spider import BaseSpider
from scrapy.tests import get_testdata

def setUp():
formats = ['tar', 'xml.bz2', 'xml.gz', 'zip']

def _test_data(formats):
uncompressed_body = get_testdata('compressed', 'feed-sample1.xml')
test_responses = {}
for format in formats:
@ -12,29 +13,40 @@ def setUp():
test_responses[format] = Response('http://foo.com/bar', body=body)
return uncompressed_body, test_responses

class ScrapyDecompressionTest(TestCase):
uncompressed_body, test_responses = setUp()
middleware = DecompressionMiddleware()

def test_tar(self):
response, format = self.middleware.extract(self.test_responses['tar'])
assert isinstance(response, XmlResponse)
self.assertEqual(response.body, self.uncompressed_body)
class DecompressionMiddlewareTest(TestCase):

def test_zip(self):
response, format = self.middleware.extract(self.test_responses['zip'])
assert isinstance(response, XmlResponse)
self.assertEqual(response.body, self.uncompressed_body)
test_formats = ['tar', 'xml.bz2', 'xml.gz', 'zip']
uncompressed_body, test_responses = _test_data(test_formats)

def test_gz(self):
response, format = self.middleware.extract(self.test_responses['xml.gz'])
assert isinstance(response, XmlResponse)
self.assertEqual(response.body, self.uncompressed_body)
def setUp(self):
self.mw = DecompressionMiddleware()
self.spider = BaseSpider()

def test_known_compression_formats(self):
for fmt in self.test_formats:
rsp = self.test_responses[fmt]
new = self.mw.process_response(None, rsp, self.spider)
assert isinstance(new, XmlResponse), \
'Failed %s, response type %s' % (fmt, type(new).__name__)
self.assertEqual(new.body, self.uncompressed_body, fmt)

def test_plain_response(self):
rsp = Response(url='http://test.com', body=self.uncompressed_body)
new = self.mw.process_response(None, rsp, self.spider)
assert new is rsp
self.assertEqual(new.body, rsp.body)

def test_empty_response(self):
rsp = Response(url='http://test.com', body='')
new = self.mw.process_response(None, rsp, self.spider)
assert new is rsp
assert not rsp.body
assert not new.body

def tearDown(self):
del self.mw

def test_bz2(self):
response, format = self.middleware.extract(self.test_responses['xml.bz2'])
assert isinstance(response, XmlResponse)
self.assertEqual(response.body, self.uncompressed_body)

if __name__ == '__main__':
main()
scrapy/tests/test_downloadermiddleware_httpcache.py (new file, 86 lines)
@ -0,0 +1,86 @@
import unittest, tempfile, shutil, time

from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.conf import Settings
from scrapy.core.exceptions import IgnoreRequest


class HttpCacheMiddlewareTest(unittest.TestCase):

    storage_class = FilesystemCacheStorage

    def setUp(self):
        self.spider = BaseSpider('example.com')
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)

    def tearDown(self):
        shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        settings = {
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        return HttpCacheMiddleware(self._get_settings(**new_settings))

    def test_storage(self):
        storage = self._get_storage()
        request2 = self.request.copy()
        assert storage.retrieve_response(self.spider, request2) is None
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        assert isinstance(response2, HtmlResponse) # inferred from content-type header
        self.assertEqualResponse(self.response, response2)
        time.sleep(2) # wait for cache to expire
        assert storage.retrieve_response(self.spider, request2) is None

    def test_storage_expire_immediately(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        assert storage.retrieve_response(self.spider, self.request) is None
        storage.store_response(self.spider, self.request, self.response)
        assert storage.retrieve_response(self.spider, self.request) is None

    def test_storage_never_expire(self):
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=-1)
        assert storage.retrieve_response(self.spider, self.request) is None
        storage.store_response(self.spider, self.request, self.response)
        assert storage.retrieve_response(self.spider, self.request)

    def test_middleware(self):
        mw = HttpCacheMiddleware(self._get_settings())
        assert mw.process_request(self.request, self.spider) is None
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

    def test_middleware_ignore_missing(self):
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

    def assertEqualResponse(self, response1, response2):
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)

if __name__ == '__main__':
    unittest.main()
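A note on the cache tests above: both the storage backend and the middleware are driven purely by settings (HTTPCACHE_DIR, HTTPCACHE_EXPIRATION_SECS, HTTPCACHE_IGNORE_MISSING), and cache hits come back flagged as 'cached'. A minimal project-level sketch follows, inferred from these tests rather than taken from this diff; the DOWNLOADER_MIDDLEWARES entry and its order value are assumptions::

    # settings.py -- hypothetical configuration inferred from the tests above
    HTTPCACHE_DIR = '/tmp/scrapy-httpcache'    # used by FilesystemCacheStorage
    HTTPCACHE_EXPIRATION_SECS = 3600           # 0 expires entries immediately, -1 never expires
    HTTPCACHE_IGNORE_MISSING = False           # True makes uncached requests raise IgnoreRequest

    # Assumed wiring -- the diff only imports the class, it does not show this setting
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    }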
scrapy/tests/test_downloadermiddleware_httpproxy.py (new file, 86 lines)
@ -0,0 +1,86 @@
import os
import sys
from twisted.trial.unittest import TestCase, SkipTest

from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
from scrapy.core.exceptions import NotConfigured
from scrapy.http import Response, Request
from scrapy.spider import BaseSpider
from scrapy.conf import settings

spider = BaseSpider()

class TestDefaultHeadersMiddleware(TestCase):

    failureException = AssertionError

    def setUp(self):
        self._oldenv = os.environ.copy()

    def tearDown(self):
        os.environ = self._oldenv

    def test_no_proxies(self):
        os.environ = {}
        self.assertRaises(NotConfigured, HttpProxyMiddleware)

    def test_no_enviroment_proxies(self):
        os.environ = {'dummy_proxy': 'reset_env_and_do_not_raise'}
        mw = HttpProxyMiddleware()

        for url in ('http://e.com', 'https://e.com', 'file:///tmp/a'):
            req = Request(url)
            assert mw.process_request(req, spider) is None
            self.assertEquals(req.url, url)
            self.assertEquals(req.meta, {})

    def test_enviroment_proxies(self):
        os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
        os.environ['https_proxy'] = https_proxy = 'http://proxy.for.https:8080'
        os.environ.pop('file_proxy', None)
        mw = HttpProxyMiddleware()

        for url, proxy in [('http://e.com', http_proxy),
                ('https://e.com', https_proxy), ('file://tmp/a', None)]:
            req = Request(url)
            assert mw.process_request(req, spider) is None
            self.assertEquals(req.url, url)
            self.assertEquals(req.meta.get('proxy'), proxy)

    def test_proxy_auth(self):
        os.environ['http_proxy'] = 'https://user:pass@proxy:3128'
        mw = HttpProxyMiddleware()
        req = Request('http://scrapytest.org')
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'})
        self.assertEquals(req.headers.get('Proxy-Authorization'), 'Basic dXNlcjpwYXNz')

    def test_proxy_already_seted(self):
        os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
        mw = HttpProxyMiddleware()
        req = Request('http://noproxy.com', meta={'proxy': None})
        assert mw.process_request(req, spider) is None
        assert 'proxy' in req.meta and req.meta['proxy'] is None


    def test_no_proxy(self):
        if sys.version_info < (2, 6):
            raise SkipTest('no_proxy is not supported in python < 2.6')
        os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128'
        mw = HttpProxyMiddleware()

        os.environ['no_proxy'] = '*'
        req = Request('http://noproxy.com')
        assert mw.process_request(req, spider) is None
        assert 'proxy' not in req.meta

        os.environ['no_proxy'] = 'other.com'
        req = Request('http://noproxy.com')
        assert mw.process_request(req, spider) is None
        assert 'proxy' in req.meta

        os.environ['no_proxy'] = 'other.com,noproxy.com'
        req = Request('http://noproxy.com')
        assert mw.process_request(req, spider) is None
        assert 'proxy' not in req.meta
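The proxy tests above exercise the two ways a proxy ends up on a request: the standard http_proxy/https_proxy/no_proxy environment variables, read when the middleware is constructed, and an explicit 'proxy' key in Request.meta (which, when set to None, keeps the environment proxy from being applied). A small usage sketch, not part of this diff::

    # Hypothetical usage inferred from the tests above
    from scrapy.http import Request

    # proxy picked up from http_proxy/https_proxy in the environment
    req = Request('http://example.com/page')

    # explicit per-request proxy
    req_via_proxy = Request('http://example.com/page',
                            meta={'proxy': 'http://localhost:3128'})

    # an explicit None disables proxying for this request
    req_no_proxy = Request('http://example.com/page', meta={'proxy': None})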
@ -64,6 +64,7 @@ class RedirectMiddlewareTest(unittest.TestCase):
        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')

    def test_meta_refresh_with_high_interval(self):
        # meta-refresh with high intervals don't trigger redirects
        body = """<html>
            <head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head>
@ -74,6 +75,25 @@ class RedirectMiddlewareTest(unittest.TestCase):

        assert rsp is rsp2

    def test_meta_refresh_trough_posted_request(self):
        body = """<html>
            <head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            </html>"""
        req = Request(url='http://example.org', method='POST', body='test',
            headers={'Content-Type': 'text/plain', 'Content-length': '4'})
        rsp = Response(url='http://example.org', body=body)
        req2 = self.mw.process_response(req, rsp, self.spider)

        assert isinstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')
        self.assertEqual(req2.method, 'GET')
        assert 'Content-Type' not in req2.headers, \
            "Content-Type header must not be present in redirected request"
        assert 'Content-Length' not in req2.headers, \
            "Content-Length header must not be present in redirected request"
        assert not req2.body, \
            "Redirected body must be empty, not '%s'" % req2.body

    def test_max_redirect_times(self):
        self.mw.max_redirect_times = 1
        req = Request('http://scrapytest.org/302')
@ -1,6 +1,5 @@
|
||||
from unittest import TestCase
|
||||
|
||||
from scrapy.conf import settings
|
||||
from scrapy.contrib.downloadermiddleware.stats import DownloaderStats
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.spider import BaseSpider
|
||||
@ -10,11 +9,10 @@ from scrapy.stats import stats
|
||||
class TestDownloaderStats(TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.spider = BaseSpider()
|
||||
self.spider.domain_name = 'scrapytest.org'
|
||||
self.spider = BaseSpider('scrapytest.org')
|
||||
self.mw = DownloaderStats()
|
||||
|
||||
stats.open_domain(self.spider.domain_name)
|
||||
stats.open_spider(self.spider)
|
||||
|
||||
self.req = Request('scrapytest.org')
|
||||
self.res = Response('scrapytest.org', status=400)
|
||||
@ -22,18 +20,18 @@ class TestDownloaderStats(TestCase):
|
||||
def test_process_request(self):
|
||||
self.mw.process_request(self.req, self.spider)
|
||||
self.assertEqual(stats.get_value('downloader/request_count', \
|
||||
domain=self.spider.domain_name), 1)
|
||||
spider=self.spider), 1)
|
||||
|
||||
def test_process_response(self):
|
||||
self.mw.process_response(self.req, self.res, self.spider)
|
||||
self.assertEqual(stats.get_value('downloader/response_count', \
|
||||
domain=self.spider.domain_name), 1)
|
||||
spider=self.spider), 1)
|
||||
|
||||
def test_process_exception(self):
|
||||
self.mw.process_exception(self.req, Exception(), self.spider)
|
||||
self.assertEqual(stats.get_value('downloader/exception_count', \
|
||||
domain=self.spider.domain_name), 1)
|
||||
spider=self.spider), 1)
|
||||
|
||||
def tearDown(self):
|
||||
stats.close_domain(self.spider.domain_name, '')
|
||||
stats.close_spider(self.spider, '')
|
||||
|
||||
|
@ -89,9 +89,9 @@ class CrawlingSession(object):
|
||||
|
||||
dispatcher.connect(self.record_signal, signals.engine_started)
|
||||
dispatcher.connect(self.record_signal, signals.engine_stopped)
|
||||
dispatcher.connect(self.record_signal, signals.domain_opened)
|
||||
dispatcher.connect(self.record_signal, signals.domain_idle)
|
||||
dispatcher.connect(self.record_signal, signals.domain_closed)
|
||||
dispatcher.connect(self.record_signal, signals.spider_opened)
|
||||
dispatcher.connect(self.record_signal, signals.spider_idle)
|
||||
dispatcher.connect(self.record_signal, signals.spider_closed)
|
||||
dispatcher.connect(self.item_scraped, signals.item_scraped)
|
||||
dispatcher.connect(self.request_received, signals.request_received)
|
||||
dispatcher.connect(self.response_downloaded, signals.response_downloaded)
|
||||
@ -201,16 +201,16 @@ class EngineTest(unittest.TestCase):
|
||||
|
||||
assert signals.engine_started in session.signals_catched
|
||||
assert signals.engine_stopped in session.signals_catched
|
||||
assert signals.domain_opened in session.signals_catched
|
||||
assert signals.domain_idle in session.signals_catched
|
||||
assert signals.domain_closed in session.signals_catched
|
||||
assert signals.spider_opened in session.signals_catched
|
||||
assert signals.spider_idle in session.signals_catched
|
||||
assert signals.spider_closed in session.signals_catched
|
||||
|
||||
self.assertEqual({'domain': session.domain, 'spider': session.spider},
|
||||
session.signals_catched[signals.domain_opened])
|
||||
self.assertEqual({'domain': session.domain, 'spider': session.spider},
|
||||
session.signals_catched[signals.domain_idle])
|
||||
self.assertEqual({'domain': session.domain, 'spider': session.spider, 'reason': 'finished'},
|
||||
session.signals_catched[signals.domain_closed])
|
||||
self.assertEqual({'spider': session.spider},
|
||||
session.signals_catched[signals.spider_opened])
|
||||
self.assertEqual({'spider': session.spider},
|
||||
session.signals_catched[signals.spider_idle])
|
||||
self.assertEqual({'spider': session.spider, 'reason': 'finished'},
|
||||
session.signals_catched[signals.spider_closed])
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1 and sys.argv[1] == 'runserver':
|
||||
|
@ -258,6 +258,55 @@ class FormRequestTest(RequestTest):
|
||||
self.assertEqual(fs['one'].value, '1')
|
||||
self.assertEqual(fs['two'].value, '2')
|
||||
|
||||
def test_from_response_submit_first_clickeable(self):
|
||||
respbody = """
|
||||
<form action="get.php" method="GET">
|
||||
<input type="submit" name="clickeable1" value="clicked1">
|
||||
<input type="hidden" name="one" value="1">
|
||||
<input type="hidden" name="two" value="3">
|
||||
<input type="submit" name="clickeable2" value="clicked2">
|
||||
</form>
|
||||
"""
|
||||
response = Response("http://www.example.com/this/list.html", body=respbody)
|
||||
r1 = self.request_class.from_response(response, formdata={'two': '2'})
|
||||
urlargs = cgi.parse_qs(urlparse(r1.url).query)
|
||||
self.assertEqual(urlargs['clickeable1'], ['clicked1'])
|
||||
self.assertFalse('clickeable2' in urlargs, urlargs)
|
||||
self.assertEqual(urlargs['one'], ['1'])
|
||||
self.assertEqual(urlargs['two'], ['2'])
|
||||
|
||||
def test_from_response_submit_not_first_clickeable(self):
|
||||
respbody = """
|
||||
<form action="get.php" method="GET">
|
||||
<input type="submit" name="clickeable1" value="clicked1">
|
||||
<input type="hidden" name="one" value="1">
|
||||
<input type="hidden" name="two" value="3">
|
||||
<input type="submit" name="clickeable2" value="clicked2">
|
||||
</form>
|
||||
"""
|
||||
response = Response("http://www.example.com/this/list.html", body=respbody)
|
||||
r1 = self.request_class.from_response(response, formdata={'two': '2'}, clickdata={'name': 'clickeable2'})
|
||||
urlargs = cgi.parse_qs(urlparse(r1.url).query)
|
||||
self.assertEqual(urlargs['clickeable2'], ['clicked2'])
|
||||
self.assertFalse('clickeable1' in urlargs, urlargs)
|
||||
self.assertEqual(urlargs['one'], ['1'])
|
||||
self.assertEqual(urlargs['two'], ['2'])
|
||||
|
||||
def test_from_response_dont_click(self):
|
||||
respbody = """
|
||||
<form action="get.php" method="GET">
|
||||
<input type="submit" name="clickeable1" value="clicked1">
|
||||
<input type="hidden" name="one" value="1">
|
||||
<input type="hidden" name="two" value="3">
|
||||
<input type="submit" name="clickeable2" value="clicked2">
|
||||
</form>
|
||||
"""
|
||||
response = Response("http://www.example.com/this/list.html", body=respbody)
|
||||
r1 = self.request_class.from_response(response, dont_click=True)
|
||||
urlargs = cgi.parse_qs(urlparse(r1.url).query)
|
||||
self.assertFalse('clickeable1' in urlargs, urlargs)
|
||||
self.assertFalse('clickeable2' in urlargs, urlargs)
|
||||
|
||||
def test_from_response_errors_noform(self):
|
||||
respbody = """<html></html>"""
|
||||
response = Response("http://www.example.com/lala.html", body=respbody)
|
||||
|
@ -2,6 +2,7 @@ import unittest
|
||||
import weakref
|
||||
|
||||
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
||||
from scrapy.conf import settings
|
||||
|
||||
|
||||
class BaseResponseTest(unittest.TestCase):
|
||||
@ -138,8 +139,10 @@ class TextResponseTest(BaseResponseTest):
|
||||
self.assertEqual(r3.encoding, "latin1")
|
||||
|
||||
def test_unicode_url(self):
|
||||
# instantiate with unicode url without encoding
|
||||
self.assertRaises(TypeError, self.response_class, u"http://www.example.com/")
|
||||
# instantiate with unicode url without encoding (should set default encoding)
|
||||
resp = self.response_class(u"http://www.example.com/")
|
||||
self.assertEqual(resp.encoding, settings['DEFAULT_RESPONSE_ENCODING'])
|
||||
|
||||
# make sure urls are converted to str
|
||||
resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
|
||||
assert isinstance(resp.url, str)
|
||||
@ -187,7 +190,6 @@ class TextResponseTest(BaseResponseTest):
|
||||
# TextResponse (and subclasses) must be passed a encoding when instantiating with unicode bodies
|
||||
self.assertRaises(TypeError, self.response_class, "http://www.example.com", body=u"\xa3")
|
||||
|
||||
|
||||
class HtmlResponseTest(TextResponseTest):
|
||||
|
||||
response_class = HtmlResponse
|
||||
@ -229,8 +231,7 @@ class XmlResponseTest(TextResponseTest):
|
||||
|
||||
body = "<xml></xml>"
|
||||
r1 = self.response_class("http://www.example.com", body=body)
|
||||
# XXX: we may want to swtich default XmlResponse encoding to utf-8
|
||||
self._assert_response_values(r1, 'ascii', body)
|
||||
self._assert_response_values(r1, settings['DEFAULT_RESPONSE_ENCODING'], body)
|
||||
|
||||
body = """<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
|
||||
r2 = self.response_class("http://www.example.com", body=body)
|
||||
|
@ -1,6 +1,6 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.item import Item, Field, ScrapedItem
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
|
||||
class ItemTest(unittest.TestCase):
|
||||
@ -127,23 +127,5 @@ class ItemTest(unittest.TestCase):
|
||||
self.assertEqual(dict(i), {'name': u'John'})
|
||||
|
||||
|
||||
# NOTE: ScrapedItem is deprecated and will be removed in the next stable
|
||||
# release, and so will these tests.
|
||||
|
||||
class ScrapedItemTestCase(unittest.TestCase):
|
||||
|
||||
def test_item(self):
|
||||
|
||||
class MyItem(ScrapedItem):
|
||||
pass
|
||||
|
||||
item = MyItem()
|
||||
self.assertEqual(repr(item), 'MyItem({})')
|
||||
|
||||
item = ScrapedItem({'key': 'value'})
|
||||
self.assertEqual(item.key, 'value')
|
||||
|
||||
self.assertRaises(TypeError, ScrapedItem, 10)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -241,6 +241,12 @@ class XPathSelectorTestCase(unittest.TestCase):
|
||||
u'\n ',
|
||||
u'\n pff\n'])
|
||||
|
||||
@libxml2debug
|
||||
def test_empty_bodies(self):
|
||||
r1 = TextResponse('http://www.example.com', body='')
|
||||
hxs = HtmlXPathSelector(r1) # shouldn't raise error
|
||||
xxs = XmlXPathSelector(r1) # shouldn't raise error
|
||||
|
||||
@libxml2debug
|
||||
def test_weakref_slots(self):
|
||||
"""Check that classes are using slots and are weak-referenceable"""
|
||||
|
@ -14,10 +14,9 @@ class TestDepthMiddleware(TestCase):
|
||||
settings.overrides['DEPTH_LIMIT'] = 1
|
||||
settings.overrides['DEPTH_STATS'] = True
|
||||
|
||||
self.spider = BaseSpider()
|
||||
self.spider.domain_name = 'scrapytest.org'
|
||||
self.spider = BaseSpider('scrapytest.org')
|
||||
|
||||
stats.open_domain(self.spider.domain_name)
|
||||
stats.open_spider(self.spider)
|
||||
|
||||
self.mw = DepthMiddleware()
|
||||
self.assertEquals(stats.get_value('envinfo/request_depth_limit'), 1)
|
||||
@ -31,8 +30,7 @@ class TestDepthMiddleware(TestCase):
|
||||
out = list(self.mw.process_spider_output(resp, result, self.spider))
|
||||
self.assertEquals(out, result)
|
||||
|
||||
rdc = stats.get_value('request_depth_count/1',
|
||||
domain=self.spider.domain_name)
|
||||
rdc = stats.get_value('request_depth_count/1', spider=self.spider)
|
||||
self.assertEquals(rdc, 1)
|
||||
|
||||
req.meta['depth'] = 1
|
||||
@ -40,8 +38,7 @@ class TestDepthMiddleware(TestCase):
|
||||
out2 = list(self.mw.process_spider_output(resp, result, self.spider))
|
||||
self.assertEquals(out2, [])
|
||||
|
||||
rdm = stats.get_value('request_depth_max',
|
||||
domain=self.spider.domain_name)
|
||||
rdm = stats.get_value('request_depth_max', spider=self.spider)
|
||||
self.assertEquals(rdm, 1)
|
||||
|
||||
def tearDown(self):
|
||||
@ -49,5 +46,5 @@ class TestDepthMiddleware(TestCase):
|
||||
del settings.overrides['DEPTH_STATS']
|
||||
settings.disabled = True
|
||||
|
||||
stats.close_domain(self.spider.domain_name, '')
|
||||
stats.close_spider(self.spider, '')
|
||||
|
||||
|
@ -13,7 +13,7 @@ class TestOffsiteMiddleware(TestCase):
        self.spider.extra_domain_names = ['scrapy.org']

        self.mw = OffsiteMiddleware()
        self.mw.domain_opened(self.spider)
        self.mw.spider_opened(self.spider)

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
@ -28,5 +28,5 @@ class TestOffsiteMiddleware(TestCase):
        self.assertEquals(out, onsite_reqs)

    def tearDown(self):
        self.mw.domain_closed(self.spider)
        self.mw.spider_closed(self.spider)
@ -1,12 +1,16 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.spider import BaseSpider
|
||||
from scrapy.xlib.pydispatch import dispatcher
|
||||
from scrapy.stats.collector import StatsCollector, DummyStatsCollector
|
||||
from scrapy.stats.signals import stats_domain_opened, stats_domain_closing, \
|
||||
stats_domain_closed
|
||||
from scrapy.stats.signals import stats_spider_opened, stats_spider_closing, \
|
||||
stats_spider_closed
|
||||
|
||||
class StatsCollectorTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.spider = BaseSpider()
|
||||
|
||||
def test_collector(self):
|
||||
stats = StatsCollector()
|
||||
self.assertEqual(stats.get_stats(), {})
|
||||
@ -43,44 +47,45 @@ class StatsCollectorTest(unittest.TestCase):
|
||||
stats.inc_value('v1')
|
||||
stats.max_value('v2', 100)
|
||||
stats.min_value('v3', 100)
|
||||
stats.open_domain('a')
|
||||
stats.set_value('test', 'value', domain='a')
|
||||
stats.open_spider('a')
|
||||
stats.set_value('test', 'value', spider=self.spider)
|
||||
self.assertEqual(stats.get_stats(), {})
|
||||
self.assertEqual(stats.get_stats('a'), {})
|
||||
|
||||
def test_signals(self):
|
||||
signals_catched = set()
|
||||
|
||||
def domain_open(domain):
|
||||
assert domain == 'example.com'
|
||||
signals_catched.add(stats_domain_opened)
|
||||
def spider_opened(spider):
|
||||
assert spider is self.spider
|
||||
signals_catched.add(stats_spider_opened)
|
||||
|
||||
def domain_closing(domain, reason):
|
||||
assert domain == 'example.com'
|
||||
def spider_closing(spider, reason):
|
||||
assert spider is self.spider
|
||||
assert reason == 'testing'
|
||||
signals_catched.add(stats_domain_closing)
|
||||
signals_catched.add(stats_spider_closing)
|
||||
|
||||
def domain_closed(domain, reason, domain_stats):
|
||||
assert domain == 'example.com'
|
||||
def spider_closed(spider, reason, spider_stats):
|
||||
assert spider is self.spider
|
||||
assert reason == 'testing'
|
||||
assert domain_stats == {'test': 1}
|
||||
signals_catched.add(stats_domain_closed)
|
||||
assert spider_stats == {'test': 1}
|
||||
signals_catched.add(stats_spider_closed)
|
||||
|
||||
dispatcher.connect(domain_open, signal=stats_domain_opened)
|
||||
dispatcher.connect(domain_closing, signal=stats_domain_closing)
|
||||
dispatcher.connect(domain_closed, signal=stats_domain_closed)
|
||||
dispatcher.connect(spider_opened, signal=stats_spider_opened)
|
||||
dispatcher.connect(spider_closing, signal=stats_spider_closing)
|
||||
dispatcher.connect(spider_closed, signal=stats_spider_closed)
|
||||
|
||||
stats = StatsCollector()
|
||||
stats.open_domain('example.com')
|
||||
stats.set_value('test', 1, domain='example.com')
|
||||
stats.close_domain('example.com', 'testing')
|
||||
assert stats_domain_opened in signals_catched
|
||||
assert stats_domain_closing in signals_catched
|
||||
assert stats_domain_closed in signals_catched
|
||||
stats.open_spider(self.spider)
|
||||
stats.set_value('test', 1, spider=self.spider)
|
||||
self.assertEqual([(self.spider, {'test': 1})], list(stats.iter_spider_stats()))
|
||||
stats.close_spider(self.spider, 'testing')
|
||||
assert stats_spider_opened in signals_catched
|
||||
assert stats_spider_closing in signals_catched
|
||||
assert stats_spider_closed in signals_catched
|
||||
|
||||
dispatcher.disconnect(domain_open, signal=stats_domain_opened)
|
||||
dispatcher.disconnect(domain_closing, signal=stats_domain_closing)
|
||||
dispatcher.disconnect(domain_closed, signal=stats_domain_closed)
|
||||
dispatcher.disconnect(spider_opened, signal=stats_spider_opened)
|
||||
dispatcher.disconnect(spider_closing, signal=stats_spider_closing)
|
||||
dispatcher.disconnect(spider_closed, signal=stats_spider_closed)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -1,15 +1,17 @@
|
||||
import os
|
||||
import unittest
|
||||
import libxml2
|
||||
from twisted.trial import unittest
|
||||
|
||||
from scrapy.utils.iterators import csviter, xmliter
|
||||
from scrapy.contrib_exp.iterators import xmliter_lxml
|
||||
from scrapy.http import XmlResponse, TextResponse
|
||||
from scrapy.tests import get_testdata
|
||||
|
||||
class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
### NOTE: Encoding issues have been found with BeautifulSoup for utf-16 files, utf-16 test removed ###
|
||||
# pablo: Tests shouldn't be removed, but commented with proper steps on how
|
||||
# to reproduce the missing functionality
|
||||
|
||||
class XmliterTestCase(unittest.TestCase):
|
||||
|
||||
xmliter = staticmethod(xmliter)
|
||||
|
||||
def test_xmliter(self):
|
||||
body = """<?xml version="1.0" encoding="UTF-8"?>\
|
||||
<products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="someschmea.xsd">\
|
||||
@ -25,7 +27,7 @@ class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
|
||||
response = XmlResponse(url="http://example.com", body=body)
|
||||
attrs = []
|
||||
for x in xmliter(response, 'product'):
|
||||
for x in self.xmliter(response, 'product'):
|
||||
attrs.append((x.select("@id").extract(), x.select("name/text()").extract(), x.select("./type/text()").extract()))
|
||||
|
||||
self.assertEqual(attrs,
|
||||
@ -34,7 +36,7 @@ class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
def test_xmliter_text(self):
|
||||
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
|
||||
|
||||
self.assertEqual([x.select("text()").extract() for x in xmliter(body, 'product')],
|
||||
self.assertEqual([x.select("text()").extract() for x in self.xmliter(body, 'product')],
|
||||
[[u'one'], [u'two']])
|
||||
|
||||
def test_xmliter_namespaces(self):
|
||||
@ -57,7 +59,7 @@ class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
</rss>
|
||||
"""
|
||||
response = XmlResponse(url='http://mydummycompany.com', body=body)
|
||||
my_iter = xmliter(response, 'item')
|
||||
my_iter = self.xmliter(response, 'item')
|
||||
|
||||
node = my_iter.next()
|
||||
node.register_namespace('g', 'http://base.google.com/ns/1.0')
|
||||
@ -74,7 +76,7 @@ class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
def test_xmliter_exception(self):
|
||||
body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
|
||||
|
||||
iter = xmliter(body, 'product')
|
||||
iter = self.xmliter(body, 'product')
|
||||
iter.next()
|
||||
iter.next()
|
||||
|
||||
@ -84,10 +86,19 @@ class UtilsIteratorsTestCase(unittest.TestCase):
|
||||
body = '<?xml version="1.0" encoding="ISO-8859-9"?>\n<xml>\n <item>Some Turkish Characters \xd6\xc7\xde\xdd\xd0\xdc \xfc\xf0\xfd\xfe\xe7\xf6</item>\n</xml>\n\n'
|
||||
response = XmlResponse('http://www.example.com', body=body)
|
||||
self.assertEqual(
|
||||
xmliter(response, 'item').next().extract(),
|
||||
self.xmliter(response, 'item').next().extract(),
|
||||
u'<item>Some Turkish Characters \xd6\xc7\u015e\u0130\u011e\xdc \xfc\u011f\u0131\u015f\xe7\xf6</item>'
|
||||
)
|
||||
|
||||
|
||||
class LxmlXmliterTestCase(XmliterTestCase):
|
||||
xmliter = staticmethod(xmliter_lxml)
|
||||
try:
|
||||
import lxml
|
||||
except ImportError:
|
||||
skip = True
|
||||
|
||||
|
||||
class UtilsCsvTestCase(unittest.TestCase):
|
||||
sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')
|
||||
sample_feed_path = os.path.join(sample_feeds_dir, 'feed-sample3.csv')
|
||||
|
@ -42,7 +42,7 @@ class ResponseUtilsTest(unittest.TestCase):
|
||||
<body>blahablsdfsal&</body>
|
||||
</html>"""
|
||||
response = Response(url='http://example.org', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), ('5', 'http://example.org/newpage'))
|
||||
self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage'))
|
||||
|
||||
# refresh without url should return (None, None)
|
||||
body = """<meta http-equiv="refresh" content="5" />"""
|
||||
@ -52,7 +52,7 @@ class ResponseUtilsTest(unittest.TestCase):
|
||||
body = """<meta http-equiv="refresh" content="5;
|
||||
url=http://example.org/newpage" /></head>"""
|
||||
response = Response(url='http://example.org', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), ('5', 'http://example.org/newpage'))
|
||||
self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage'))
|
||||
|
||||
# meta refresh in multiple lines
|
||||
body = """<html><head>
|
||||
@ -60,7 +60,44 @@ class ResponseUtilsTest(unittest.TestCase):
|
||||
HTTP-EQUIV="Refresh"
|
||||
CONTENT="1; URL=http://example.org/newpage">"""
|
||||
response = Response(url='http://example.org', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), ('1', 'http://example.org/newpage'))
|
||||
self.assertEqual(get_meta_refresh(response), (1, 'http://example.org/newpage'))
|
||||
|
||||
# entities in the redirect url
|
||||
body = """<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">"""
|
||||
response = Response(url='http://example.com', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://www.example.com/other'))
|
||||
|
||||
# relative redirects
|
||||
body = """<meta http-equiv="refresh" content="3; url=other.html">"""
|
||||
response = Response(url='http://example.com/page/this.html', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/page/other.html'))
|
||||
|
||||
# non-standard encodings (utf-16)
|
||||
body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
|
||||
body = body.decode('ascii').encode('utf-16')
|
||||
response = TextResponse(url='http://example.com', body=body, encoding='utf-16')
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/redirect'))
|
||||
|
||||
# non-ascii chars in the url (default encoding - utf8)
|
||||
body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
|
||||
response = Response(url='http://example.com', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3'))
|
||||
|
||||
# non-ascii chars in the url (custom encoding - latin1)
|
||||
body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
|
||||
response = TextResponse(url='http://example.com', body=body, encoding='latin1')
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3'))
|
||||
|
||||
# wrong encodings (possibly caused by truncated chunks)
|
||||
body = """<meta http-equiv="refresh" content="3; url=http://example.com/this\xc2_THAT">"""
|
||||
response = Response(url='http://example.com', body=body)
|
||||
self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/thisTHAT'))
|
||||
|
||||
# responses without refresh tag should return None None
|
||||
response = Response(url='http://example.org')
|
||||
self.assertEqual(get_meta_refresh(response), (None, None))
|
||||
response = TextResponse(url='http://example.org')
|
||||
self.assertEqual(get_meta_refresh(response), (None, None))
|
||||
|
||||
def test_response_httprepr(self):
|
||||
r1 = Response("http://www.example.com")
|
||||
|
@ -13,14 +13,27 @@ from twisted.python.filepath import FilePath
|
||||
from twisted.protocols.policies import WrappingFactory
|
||||
|
||||
from scrapy.core.downloader import webclient as client
|
||||
from scrapy.http import Headers
|
||||
from scrapy.http import Request, Headers
|
||||
|
||||
|
||||
def getPage(url, contextFactory=None, *args, **kwargs):
|
||||
"""Adapted version of twisted.web.client.getPage"""
|
||||
def _clientfactory(*args, **kwargs):
|
||||
timeout = kwargs.pop('timeout', 0)
|
||||
f = client.ScrapyHTTPClientFactory(Request(*args, **kwargs), timeout=timeout)
|
||||
f.deferred.addCallback(lambda r: r.body)
|
||||
return f
|
||||
|
||||
from twisted.web.client import _makeGetterFactory
|
||||
return _makeGetterFactory(url, _clientfactory,
|
||||
contextFactory=contextFactory, *args, **kwargs).deferred
|
||||
|
||||
|
||||
class ParseUrlTestCase(unittest.TestCase):
|
||||
"""Test URL parsing facility and defaults values."""
|
||||
|
||||
def _parse(self, url):
|
||||
f = client.ScrapyHTTPClientFactory(url)
|
||||
f = client.ScrapyHTTPClientFactory(Request(url))
|
||||
return (f.scheme, f.netloc, f.host, f.port, f.path)
|
||||
|
||||
def testParse(self):
|
||||
@ -75,7 +88,7 @@ class ScrapyHTTPPageGetterTests(unittest.TestCase):
|
||||
|
||||
def test_earlyHeaders(self):
|
||||
# basic test stolen from twisted HTTPageGetter
|
||||
factory = client.ScrapyHTTPClientFactory(
|
||||
factory = client.ScrapyHTTPClientFactory(Request(
|
||||
url='http://foo/bar',
|
||||
body="some data",
|
||||
headers={
|
||||
@ -83,7 +96,7 @@ class ScrapyHTTPPageGetterTests(unittest.TestCase):
|
||||
'User-Agent': 'fooble',
|
||||
'Cookie': 'blah blah',
|
||||
'Content-Length': '12981',
|
||||
'Useful': 'value'})
|
||||
'Useful': 'value'}))
|
||||
|
||||
self._test(factory,
|
||||
"GET /bar HTTP/1.0\r\n"
|
||||
@ -97,18 +110,18 @@ class ScrapyHTTPPageGetterTests(unittest.TestCase):
|
||||
"some data")
|
||||
|
||||
# test minimal sent headers
|
||||
factory = client.ScrapyHTTPClientFactory('http://foo/bar')
|
||||
factory = client.ScrapyHTTPClientFactory(Request('http://foo/bar'))
|
||||
self._test(factory,
|
||||
"GET /bar HTTP/1.0\r\n"
|
||||
"Host: foo\r\n"
|
||||
"\r\n")
|
||||
|
||||
# test a simple POST with body and content-type
|
||||
factory = client.ScrapyHTTPClientFactory(
|
||||
factory = client.ScrapyHTTPClientFactory(Request(
|
||||
method='POST',
|
||||
url='http://foo/bar',
|
||||
body='name=value',
|
||||
headers={'Content-Type': 'application/x-www-form-urlencoded'})
|
||||
headers={'Content-Type': 'application/x-www-form-urlencoded'}))
|
||||
|
||||
self._test(factory,
|
||||
"POST /bar HTTP/1.0\r\n"
|
||||
@ -120,12 +133,12 @@ class ScrapyHTTPPageGetterTests(unittest.TestCase):
|
||||
"name=value")
|
||||
|
||||
# test with single and multivalued headers
|
||||
factory = client.ScrapyHTTPClientFactory(
|
||||
factory = client.ScrapyHTTPClientFactory(Request(
|
||||
url='http://foo/bar',
|
||||
headers={
|
||||
'X-Meta-Single': 'single',
|
||||
'X-Meta-Multivalued': ['value1', 'value2'],
|
||||
})
|
||||
}))
|
||||
|
||||
self._test(factory,
|
||||
"GET /bar HTTP/1.0\r\n"
|
||||
@ -136,12 +149,12 @@ class ScrapyHTTPPageGetterTests(unittest.TestCase):
|
||||
"\r\n")
|
||||
|
||||
# same test with single and multivalued headers but using Headers class
|
||||
factory = client.ScrapyHTTPClientFactory(
|
||||
factory = client.ScrapyHTTPClientFactory(Request(
|
||||
url='http://foo/bar',
|
||||
headers=Headers({
|
||||
'X-Meta-Single': 'single',
|
||||
'X-Meta-Multivalued': ['value1', 'value2'],
|
||||
}))
|
||||
})))
|
||||
|
||||
self._test(factory,
|
||||
"GET /bar HTTP/1.0\r\n"
|
||||
@ -193,11 +206,11 @@ class WebClientTestCase(unittest.TestCase):
|
||||
|
||||
def testPayload(self):
|
||||
s = "0123456789" * 10
|
||||
return client.getPage(self.getURL("payload"), body=s).addCallback(self.assertEquals, s)
|
||||
return getPage(self.getURL("payload"), body=s).addCallback(self.assertEquals, s)
|
||||
|
||||
def testBrokenDownload(self):
|
||||
# test what happens when download gets disconnected in the middle
|
||||
d = client.getPage(self.getURL("broken"))
|
||||
d = getPage(self.getURL("broken"))
|
||||
d = self.assertFailure(d, client.PartialDownloadError)
|
||||
d.addCallback(lambda exc: self.assertEquals(exc.response, "abc"))
|
||||
return d
|
||||
@ -206,8 +219,8 @@ class WebClientTestCase(unittest.TestCase):
|
||||
# if we pass Host header explicitly, it should be used, otherwise
|
||||
# it should extract from url
|
||||
return defer.gatherResults([
|
||||
client.getPage(self.getURL("host")).addCallback(self.assertEquals, "127.0.0.1:%d" % self.portno),
|
||||
client.getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(self.assertEquals, "www.example.com")])
|
||||
getPage(self.getURL("host")).addCallback(self.assertEquals, "127.0.0.1:%d" % self.portno),
|
||||
getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback(self.assertEquals, "www.example.com")])
|
||||
|
||||
|
||||
def test_getPage(self):
|
||||
@ -215,7 +228,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
L{client.getPage} returns a L{Deferred} which is called back with
|
||||
the body of the response if the default method B{GET} is used.
|
||||
"""
|
||||
d = client.getPage(self.getURL("file"))
|
||||
d = getPage(self.getURL("file"))
|
||||
d.addCallback(self.assertEquals, "0123456789")
|
||||
return d
|
||||
|
||||
@ -226,11 +239,11 @@ class WebClientTestCase(unittest.TestCase):
|
||||
the empty string if the method is C{HEAD} and there is a successful
|
||||
response code.
|
||||
"""
|
||||
def getPage(method):
|
||||
return client.getPage(self.getURL("file"), method=method)
|
||||
def _getPage(method):
|
||||
return getPage(self.getURL("file"), method=method)
|
||||
return defer.gatherResults([
|
||||
getPage("head").addCallback(self.assertEqual, ""),
|
||||
getPage("HEAD").addCallback(self.assertEqual, "")])
|
||||
_getPage("head").addCallback(self.assertEqual, ""),
|
||||
_getPage("HEAD").addCallback(self.assertEqual, "")])
|
||||
|
||||
|
||||
def test_timeoutNotTriggering(self):
|
||||
@ -239,7 +252,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
retrieved before the timeout period elapses, the L{Deferred} is
|
||||
called back with the contents of the page.
|
||||
"""
|
||||
d = client.getPage(self.getURL("host"), timeout=100)
|
||||
d = getPage(self.getURL("host"), timeout=100)
|
||||
d.addCallback(self.assertEquals, "127.0.0.1:%d" % self.portno)
|
||||
return d
|
||||
|
||||
@ -251,7 +264,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
L{Deferred} is errbacked with a L{error.TimeoutError}.
|
||||
"""
|
||||
finished = self.assertFailure(
|
||||
client.getPage(self.getURL("wait"), timeout=0.000001),
|
||||
getPage(self.getURL("wait"), timeout=0.000001),
|
||||
defer.TimeoutError)
|
||||
def cleanup(passthrough):
|
||||
# Clean up the server which is hanging around not doing
|
||||
@ -266,7 +279,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
return finished
|
||||
|
||||
def testNotFound(self):
|
||||
return client.getPage(self.getURL('notsuchfile')).addCallback(self._cbNoSuchFile)
|
||||
return getPage(self.getURL('notsuchfile')).addCallback(self._cbNoSuchFile)
|
||||
|
||||
def _cbNoSuchFile(self, pageData):
|
||||
self.assert_('404 - No Such Resource' in pageData)
|
||||
@ -274,7 +287,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
def testFactoryInfo(self):
|
||||
url = self.getURL('file')
|
||||
scheme, netloc, host, port, path = client._parse(url)
|
||||
factory = client.ScrapyHTTPClientFactory(url)
|
||||
factory = client.ScrapyHTTPClientFactory(Request(url))
|
||||
reactor.connectTCP(host, port, factory)
|
||||
return factory.deferred.addCallback(self._cbFactoryInfo, factory)
|
||||
|
||||
@ -285,7 +298,7 @@ class WebClientTestCase(unittest.TestCase):
|
||||
self.assertEquals(factory.response_headers['content-length'], '10')
|
||||
|
||||
def testRedirect(self):
|
||||
return client.getPage(self.getURL("redirect")).addCallback(self._cbRedirect)
|
||||
return getPage(self.getURL("redirect")).addCallback(self._cbRedirect)
|
||||
|
||||
def _cbRedirect(self, pageData):
|
||||
self.assertEquals(pageData,
|
||||
|
@ -1,4 +1,5 @@
import re, csv
from cStringIO import StringIO

from scrapy.http import Response
from scrapy.selector import XmlXPathSelector
@ -6,6 +7,7 @@ from scrapy import log
from scrapy.utils.python import re_rsearch, str_to_unicode
from scrapy.utils.response import body_or_str


def xmliter(obj, nodename):
    """Return a iterator of XPathSelector's over all nodes of a XML document,
       given tha name of the node to iterate. Useful for parsing XML feeds.
@ -29,6 +31,7 @@ def xmliter(obj, nodename):
        nodetext = header_start + match.group() + header_end
        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]


def csviter(obj, delimiter=None, headers=None, encoding=None):
    """ Returns an iterator of dictionaries from the given csv object

@ -46,7 +49,7 @@ def csviter(obj, delimiter=None, headers=None, encoding=None):
    def _getrow(csv_r):
        return [str_to_unicode(field, encoding) for field in csv_r.next()]

    lines = body_or_str(obj, unicode=False).splitlines(True)
    lines = StringIO(body_or_str(obj, unicode=False))
    if delimiter:
        csv_r = csv.reader(lines, delimiter=delimiter)
    else:
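The csviter() change above feeds the response body to the csv reader as a single cStringIO buffer instead of a list produced by splitlines(); the iterator itself still yields one dict of unicode values per row. A usage sketch, assuming the first row supplies the field names when headers is not given (the sample body below is made up)::

    # Hypothetical usage of csviter (not part of this diff)
    from scrapy.http import Response
    from scrapy.utils.iterators import csviter

    response = Response('http://example.com/feed.csv',
                        body='id,name\n1,"multi\nline"\n2,plain\n')
    for row in csviter(response):
        print row['id'], row['name']   # each row is a dict of unicode strings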
@ -13,6 +13,8 @@ from tempfile import NamedTemporaryFile
from twisted.web import http
from twisted.web.http import RESPONSES

from scrapy.utils.markup import remove_entities
from scrapy.utils.url import safe_url_string, urljoin_rfc
from scrapy.xlib.BeautifulSoup import BeautifulSoup
from scrapy.http import Response, HtmlResponse

@ -34,16 +36,29 @@ def get_base_url(response):
    _baseurl_cache[response] = match.group(1) if match else response.url
    return _baseurl_cache[response]

META_REFRESH_RE = re.compile(r'<meta[^>]*http-equiv[^>]*refresh[^>].*?(\d+);\s*url=([^"\']+)', re.DOTALL | re.IGNORECASE)
META_REFRESH_RE = re.compile(ur'<meta[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>\d+)\s*;\s*url=(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE)
_metaref_cache = weakref.WeakKeyDictionary()
def get_meta_refresh(response):
    """ Return a tuple of two strings containing the interval and url included
    in the http-equiv parameter of the HTML meta element. If no url is included
    (None, None) is returned [instead of (interval, None)]
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    if response not in _metaref_cache:
        match = META_REFRESH_RE.search(response.body[0:4096])
        _metaref_cache[response] = match.groups() if match else (None, None)
        encoding = getattr(response, 'encoding', None) or 'utf-8'
        body_chunk = remove_entities(unicode(response.body[0:4096], encoding, \
            errors='ignore'))
        match = META_REFRESH_RE.search(body_chunk)
        if match:
            interval = int(match.group('int'))
            url = safe_url_string(match.group('url').strip(' "\''))
            url = urljoin_rfc(response.url, url)
            _metaref_cache[response] = (interval, url)
        else:
            _metaref_cache[response] = (None, None)
        #_metaref_cache[response] = match.groups() if match else (None, None)
    return _metaref_cache[response]

_beautifulsoup_cache = weakref.WeakKeyDictionary()
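With the rewrite of get_meta_refresh() above, the helper now decodes a chunk of the body using the response encoding, strips entities, and returns an integer interval together with an absolute, safely-encoded URL; the unit-test hunks earlier in this diff assert exactly this behaviour. A short sketch, with the expected value taken from those tests::

    # Behaviour sketch for the rewritten get_meta_refresh
    from scrapy.http import Response
    from scrapy.utils.response import get_meta_refresh

    body = '<meta http-equiv="refresh" content="3; url=other.html">'
    response = Response('http://example.com/page/this.html', body=body)
    print get_meta_refresh(response)   # (3, 'http://example.com/page/other.html')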