Automated merge with http://hg.scrapy.org/scrapy-0.8
Commit: 3fb8058016

AUTHORS (39 lines changed)
@@ -1,28 +1,25 @@
Scrapy was brought to life by Shane Evans while hacking a scraping framework
prototype for Mydeco (mydeco.com). It soon became maintained, extended and
improved by Insophia (insophia.com), with the sponsorship of By Design (the
company behind Mydeco).
improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
bootstrap the project.

Here is the list of the primary authors & contributors, along with their user
name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
spam.
Here is the list of the primary authors & contributors:

* Pablo Hoffman (pablo)
* Daniel Graña (daniel)
* Martin Olveyra (olveyra)
* Gabriel García (elpolilla)
* Michael Cetrulo (samus_)
* Artem Bogomyagkov (artem)
* Damian Canabal (calarval)
* Andres Moreira (andres)
* Ismael Carnales (ismael)
* Matías Aguirre (omab)
* German Hoffman (german)
* Anibal Pacheco (anibal)
* Pablo Hoffman
* Daniel Graña
* Martin Olveyra
* Gabriel García
* Michael Cetrulo
* Artem Bogomyagkov
* Damian Canabal
* Andres Moreira
* Ismael Carnales
* Matías Aguirre
* German Hoffmann
* Anibal Pacheco
* Bruno Deferrari
* Shane Evans

And here is the list of people who have helped to put the Scrapy homepage live:

* Ezequiel Rivero (ezequiel)
* Ezequiel Rivero
* Patrick Mezard
* Rolando Espinoza

bin/scrapy.tac (new file, 5 lines)
@@ -0,0 +1,5 @@
from twisted.application.service import Application
from scrapy.service import ScrapyService

application = Application("Scrapy")
ScrapyService().setServiceParent(application)

docs/experimental/crawlspider-v2.rst (new file, 128 lines)
@@ -0,0 +1,128 @@
|
||||
.. _topics-crawlspider-v2:
|
||||
|
||||
==============
|
||||
CrawlSpider v2
|
||||
==============
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
TODO: introduction
|
||||
|
||||
Rules Matching
|
||||
==============
|
||||
|
||||
TODO: describe purpose of rules
|
||||
|
||||
Request Extractors & Processors
|
||||
===============================
|
||||
|
||||
TODO: describe purpose of extractors & processors
|
||||
|
||||
Examples
|
||||
========
|
||||
|
||||
TODO: plenty of examples
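
Until this section is fleshed out, here is a minimal, unofficial sketch distilled
from the experimental imdb example shipped with this changeset (the URL pattern
and callback name are purely illustrative)::

    from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
    from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        # extract candidate requests from each followed page...
        request_extractors = [SgmlRequestExtractor(tags=['a'], attrs=['href'])]
        # ...then canonicalize and de-duplicate them
        request_processors = [Canonicalize(), FilterDupes()]

        # dispatch URLs matching the pattern to a callback
        rules = (
            Rule(r'/item/\d+$', 'parse_item'),
        )

        def parse_item(self, response):
            self.log("Parsing item page: %s" % response.url)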
|
||||
|
||||
|
||||
.. module:: scrapy.contrib_exp.crawlspider.spider
|
||||
:synopsis: CrawlSpider
|
||||
|
||||
|
||||
Reference
|
||||
=========
|
||||
|
||||
CrawlSpider
|
||||
-----------
|
||||
|
||||
TODO: describe crawlspider
|
||||
|
||||
.. class:: CrawlSpider
|
||||
|
||||
TODO: describe class
|
||||
|
||||
|
||||
.. module:: scrapy.contrib_exp.crawlspider.rules
|
||||
:synopsis: Rules
|
||||
|
||||
Rules
|
||||
-----
|
||||
|
||||
TODO: describe spider rules
|
||||
|
||||
.. class:: Rule
|
||||
|
||||
TODO: describe Rules class
|
||||
|
||||
|
||||
.. module:: scrapy.contrib_exp.crawlspider.reqext
|
||||
:synopsis: Request Extractors
|
||||
|
||||
Request Extractors
|
||||
------------------
|
||||
|
||||
TODO: describe extractors purpose
|
||||
|
||||
.. class:: BaseSgmlRequestExtractor
|
||||
|
||||
TODO: describe base extractor
|
||||
|
||||
.. class:: SgmlRequestExtractor
|
||||
|
||||
TODO: describe sgml extractor
|
||||
|
||||
.. class:: XPathRequestExtractor
|
||||
|
||||
TODO: describe xpath request extractor
|
||||
|
||||
|
||||
.. module:: scrapy.contrib_exp.crawlspider.reqproc
|
||||
:synopsis: Request Processors
|
||||
|
||||
Request Processors
|
||||
------------------
|
||||
|
||||
TODO: describe request processors
|
||||
|
||||
.. class:: Canonicalize
|
||||
|
||||
TODO: describe proc
|
||||
|
||||
.. class:: Unique
|
||||
|
||||
TODO: describe unique
|
||||
|
||||
.. class:: FilterDomain
|
||||
|
||||
TODO: describe filter domain
|
||||
|
||||
.. class:: FilterUrl
|
||||
|
||||
TODO: describe filter url
|
||||
|
||||
|
||||
.. module:: scrapy.contrib_exp.crawlspider.matchers
|
||||
:synopsis: Matchers
|
||||
|
||||
Request/Response Matchers
|
||||
-------------------------
|
||||
|
||||
TODO: describe matchers
|
||||
|
||||
.. class:: BaseMatcher
|
||||
|
||||
TODO: describe base matcher
|
||||
|
||||
.. class:: UrlMatcher
|
||||
|
||||
TODO: describe url matcher
|
||||
|
||||
.. class:: UrlRegexMatcher
|
||||
|
||||
TODO: describe url regex matcher
|
||||
|
||||
.. class:: UrlListMatcher
|
||||
|
||||
TODO: describe url list matcher
|
||||
|
||||
|
@@ -21,3 +21,4 @@ it's properly merged). Use at your own risk.
|
||||
|
||||
djangoitems
|
||||
scheduler-middleware
|
||||
crawlspider-v2
|
||||
|
@ -128,7 +128,8 @@ Finally, here's the spider code::
|
||||
|
||||
class MininovaSpider(CrawlSpider):
|
||||
|
||||
domain_name = 'mininova.org'
|
||||
name = 'mininova.org'
|
||||
allowed_domains = ['mininova.org']
|
||||
start_urls = ['http://www.mininova.org/today']
|
||||
rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
|
||||
|
||||
|
@ -102,8 +102,8 @@ to parse the contents of those pages to extract :ref:`items <topics-items>`.
|
||||
To create a Spider, you must subclass :class:`scrapy.spider.BaseSpider`, and
|
||||
define the three main, mandatory, attributes:
|
||||
|
||||
* :attr:`~scrapy.spider.BaseSpider.domain_name`: identifies the Spider. It must
|
||||
be unique, that is, you can't set the same domain name for different Spiders.
|
||||
* :attr:`~scrapy.spider.BaseSpider.name`: identifies the Spider. It must be
|
||||
unique, that is, you can't set the same name for different Spiders.
|
||||
|
||||
* :attr:`~scrapy.spider.BaseSpider.start_urls`: is a list of URLs where the
|
||||
Spider will begin to crawl from. So, the first pages downloaded will be those
|
||||
@ -128,7 +128,8 @@ This is the code for our first Spider, save it in a file named
|
||||
from scrapy.spider import BaseSpider
|
||||
|
||||
class DmozSpider(BaseSpider):
|
||||
domain_name = "dmoz.org"
|
||||
name = "dmoz.org"
|
||||
allowed_domains = ["dmoz.org"]
|
||||
start_urls = [
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
|
||||
@ -354,7 +355,8 @@ Let's add this code to our spider::
|
||||
from scrapy.selector import HtmlXPathSelector
|
||||
|
||||
class DmozSpider(BaseSpider):
|
||||
domain_name = "dmoz.org"
|
||||
name = "dmoz.org"
|
||||
allowed_domains = ["dmoz.org"]
|
||||
start_urls = [
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
|
||||
@ -398,7 +400,8 @@ scraped so far, the code for our Spider should be like this::
|
||||
from dmoz.items import DmozItem
|
||||
|
||||
class DmozSpider(BaseSpider):
|
||||
domain_name = "dmoz.org"
|
||||
name = "dmoz.org"
|
||||
allowed_domains = ["dmoz.org"]
|
||||
start_urls = [
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
|
||||
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
|
||||
@ -420,8 +423,8 @@ scraped so far, the code for our Spider should be like this::
|
||||
|
||||
Now doing a crawl on the dmoz.org domain yields ``DmozItem``'s::
|
||||
|
||||
[dmoz.org] DEBUG: Scraped DmozItem({'title': [u'Text Processing in Python'], 'link': [u'http://gnosis.cx/TPiP/'], 'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
[dmoz.org] DEBUG: Scraped DmozItem({'title': [u'XML Processing with Python'], 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'], 'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
[dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n'], link=[u'http://gnosis.cx/TPiP/'], title=[u'Text Processing in Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
[dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'], link=[u'http://www.informit.com/store/product.aspx?isbn=0130211192'], title=[u'XML Processing with Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
|
||||
|
||||
Storing the data (using an Item Pipeline)
|
||||
|
@ -199,7 +199,7 @@ HttpAuthMiddleware
|
||||
|
||||
http_user = 'someuser'
|
||||
http_pass = 'somepass'
|
||||
domain_name = 'intranet.example.com'
|
||||
name = 'intranet.example.com'
|
||||
|
||||
# .. rest of the spider code omitted ...
|
||||
|
||||
|
@ -52,7 +52,7 @@ Exporter to export scraped items to different files, one per spider::
|
||||
self.files = {}
|
||||
|
||||
def spider_opened(self, spider):
|
||||
file = open('%s_products.xml' % spider.domain_name, 'w+b')
|
||||
file = open('%s_products.xml' % spider.name, 'w+b')
|
||||
self.files[spider] = file
|
||||
self.exporter = XmlItemExporter(file)
|
||||
self.exporter.start_exporting()
|
||||
|
@ -105,10 +105,10 @@ every time a domain/spider is opened and closed::
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
log.msg("opened spider %s" % spider.domain_name)
|
||||
log.msg("opened spider %s" % spider.name)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
log.msg("closed spider %s" % spider.domain_name)
|
||||
log.msg("closed spider %s" % spider.name)
|
||||
|
||||
|
||||
.. _topics-extensions-ref-manager:
|
||||
|
@ -79,7 +79,8 @@ This is how the spider would look so far::
|
||||
from scrapy.contrib.spiders import CrawlSpider, Rule
|
||||
|
||||
class GoogleDirectorySpider(CrawlSpider):
|
||||
domain_name = 'directory.google.com'
|
||||
name = 'directory.google.com'
|
||||
allowed_domains = ['directory.google.com']
|
||||
start_urls = ['http://directory.google.com/']
|
||||
|
||||
rules = (
|
||||
|
@@ -98,10 +98,10 @@ spider returns multiple items with the same id::
|
||||
del self.duplicates[spider]
|
||||
|
||||
def process_item(self, spider, item):
|
||||
if item.id in self.duplicates[spider]:
|
||||
if item['id'] in self.duplicates[spider]:
|
||||
raise DropItem("Duplicate item found: %s" % item)
|
||||
else:
|
||||
self.duplicates[spider].add(item.id)
|
||||
self.duplicates[spider].add(item['id'])
|
||||
return item
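
For context, a pipeline like this only runs once it is listed in the
``ITEM_PIPELINES`` setting; the module path below is hypothetical::

    # settings.py
    ITEM_PIPELINES = ['myproject.pipelines.DuplicatesPipeline']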
|
||||
|
||||
Built-in Item Pipelines reference
|
||||
|
@ -129,3 +129,14 @@ scrapy.log module
|
||||
|
||||
Log level for debugging messages (recommended level for development)
|
||||
|
||||
Logging settings
|
||||
================
|
||||
|
||||
These settings can be used to configure the logging:
|
||||
|
||||
* :setting:`LOG_ENABLED`
|
||||
* :setting:`LOG_ENCODING`
|
||||
* :setting:`LOG_FILE`
|
||||
* :setting:`LOG_LEVEL`
|
||||
* :setting:`LOG_STDOUT`
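
A quick, non-normative example of how these could be combined in a project's
``settings.py`` (file name and values chosen for illustration)::

    LOG_ENABLED = True
    LOG_ENCODING = 'utf-8'
    LOG_FILE = 'scrapy.log'   # send log output to a file
    LOG_LEVEL = 'DEBUG'       # recommended level for development
    LOG_STDOUT = False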
|
||||
|
||||
|
@ -321,7 +321,7 @@ user name and password. You can use the :meth:`FormRequest.from_response`
|
||||
method for this job. Here's an example spider which uses it::
|
||||
|
||||
class LoginSpider(BaseSpider):
|
||||
domain_name = 'example.com'
|
||||
name = 'example.com'
|
||||
start_urls = ['http://www.example.com/users/login.php']
|
||||
|
||||
def parse(self, response):
|
||||
@ -466,12 +466,14 @@ TextResponse objects
|
||||
|
||||
.. attribute:: TextResponse.encoding
|
||||
|
||||
A string with the encoding of this response. The encoding is resolved in the
|
||||
following order:
|
||||
A string with the encoding of this response. The encoding is resolved by
|
||||
trying the following mechanisms, in order:
|
||||
|
||||
1. the encoding passed in the constructor `encoding` argument
|
||||
|
||||
2. the encoding declared in the Content-Type HTTP header
|
||||
2. the encoding declared in the Content-Type HTTP header. If this
|
||||
encoding is not valid (ie. unknown), it is ignored and the next
|
||||
resolution mechanism is tried.
|
||||
|
||||
3. the encoding declared in the response body. The TextResponse class
|
||||
doesn't provide any special functionality for this. However, the
|
||||
@ -483,23 +485,11 @@ TextResponse objects
|
||||
:class:`TextResponse` objects support the following methods in addition to
|
||||
the standard :class:`Response` ones:
|
||||
|
||||
.. method:: TextResponse.headers_encoding()
|
||||
|
||||
Returns a string with the encoding declared in the headers (ie. the
|
||||
Content-Type HTTP header).
|
||||
|
||||
.. method:: TextResponse.body_encoding()
|
||||
|
||||
Returns a string with the encoding of the body, either declared or inferred
|
||||
from its contents. The body encoding declaration is implemented in
|
||||
:class:`TextResponse` subclasses such as: :class:`HtmlResponse` or
|
||||
:class:`XmlResponse`.
|
||||
|
||||
.. method:: TextResponse.body_as_unicode()
|
||||
|
||||
Returns the body of the response as unicode. This is equivalent to::
|
||||
|
||||
response.body.encode(response.encoding)
|
||||
response.body.decode(response.encoding)
|
||||
|
||||
But **not** equivalent to::
|
||||
|
||||
|
@ -340,16 +340,6 @@ Default: ``True``
|
||||
|
||||
Whether to collect depth stats.
|
||||
|
||||
.. setting:: DOMAIN_SCHEDULER
|
||||
|
||||
SPIDER_SCHEDULER
|
||||
----------------
|
||||
|
||||
Default: ``'scrapy.contrib.spiderscheduler.FifoSpiderScheduler'``
|
||||
|
||||
The Spider Scheduler to use. The spider scheduler returns the next spider to
|
||||
scrape.
|
||||
|
||||
.. setting:: DOWNLOADER_DEBUG
|
||||
|
||||
DOWNLOADER_DEBUG
|
||||
@ -418,6 +408,15 @@ supported. Example::
|
||||
|
||||
DOWNLOAD_DELAY = 0.25 # 250 ms of delay
|
||||
|
||||
This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
|
||||
setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
|
||||
amount of time between requests, but uses a random interval between 0.5 and 1.5
|
||||
* :setting:`DOWNLOAD_DELAY`.
|
||||
|
||||
Another way to change the download delay (per spider, instead of globally) is
|
||||
by using the ``download_delay`` spider attribute, which takes precedence
over this setting.
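
As a non-authoritative sketch (project layout and values below are only
illustrative), the two places where the delay can be configured look like this::

    # settings.py -- project-wide delay
    DOWNLOAD_DELAY = 2.0              # seconds between requests
    RANDOMIZE_DOWNLOAD_DELAY = True   # actual wait is between 0.5 and 1.5 * DOWNLOAD_DELAY

    # somespider.py -- per-spider delay, which takes precedence over the setting
    from scrapy.spider import BaseSpider

    class SlowSpider(BaseSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']
        download_delay = 5.0          # this spider is crawled more slowly

        def parse(self, response):
            pass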
|
||||
|
||||
.. setting:: DOWNLOAD_TIMEOUT
|
||||
|
||||
DOWNLOAD_TIMEOUT
|
||||
@ -439,6 +438,69 @@ The class used to detect and filter duplicate requests.
|
||||
The default (``RequestFingerprintDupeFilter``) filters based on request fingerprint
|
||||
(using ``scrapy.utils.request.request_fingerprint``) and grouping per domain.
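
To illustrate what "request fingerprint" means here, a small sketch using the
helper named above::

    from scrapy.http import Request
    from scrapy.utils.request import request_fingerprint

    fp1 = request_fingerprint(Request('http://www.example.com/page?id=1'))
    fp2 = request_fingerprint(Request('http://www.example.com/page?id=1'))
    assert fp1 == fp2   # equal requests share a fingerprint, so the duplicate is filtered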
|
||||
|
||||
.. setting:: ENCODING_ALIASES
|
||||
|
||||
ENCODING_ALIASES
|
||||
----------------
|
||||
|
||||
Default: ``{}``
|
||||
|
||||
A mapping of custom encoding aliases for your project, where the keys are the
|
||||
aliases (and must be lower case) and the values are the encodings they map to.
|
||||
|
||||
This setting extends the :setting:`ENCODING_ALIASES_BASE` setting which
|
||||
contains some default mappings.
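
For instance, a hypothetical project dealing with sites that declare a
non-standard charset label could add its own alias in ``settings.py`` (the
mapping below is purely illustrative)::

    ENCODING_ALIASES = {
        'x-gbk': 'gb18030',   # treat this vendor label as gb18030
    }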
|
||||
|
||||
.. setting:: ENCODING_ALIASES_BASE
|
||||
|
||||
ENCODING_ALIASES_BASE
|
||||
---------------------
|
||||
|
||||
Default::
|
||||
|
||||
{
|
||||
# gb2312 is superseded by gb18030
|
||||
'gb2312': 'gb18030',
|
||||
'chinese': 'gb18030',
|
||||
'csiso58gb231280': 'gb18030',
|
||||
'euc-cn': 'gb18030',
|
||||
'euccn': 'gb18030',
|
||||
'eucgb2312-cn': 'gb18030',
|
||||
'gb2312-1980': 'gb18030',
|
||||
'gb2312-80': 'gb18030',
|
||||
'iso-ir-58': 'gb18030',
|
||||
# gbk is superseded by gb18030
|
||||
'gbk': 'gb18030',
|
||||
'936': 'gb18030',
|
||||
'cp936': 'gb18030',
|
||||
'ms936': 'gb18030',
|
||||
# latin_1 is a subset of cp1252
|
||||
'latin_1': 'cp1252',
|
||||
'iso-8859-1': 'cp1252',
|
||||
'iso8859-1': 'cp1252',
|
||||
'8859': 'cp1252',
|
||||
'cp819': 'cp1252',
|
||||
'latin': 'cp1252',
|
||||
'latin1': 'cp1252',
|
||||
'l1': 'cp1252',
|
||||
# others
|
||||
'zh-cn': 'gb18030',
|
||||
'win-1251': 'cp1251',
|
||||
'macintosh' : 'mac_roman',
|
||||
'x-sjis': 'shift_jis',
|
||||
}
|
||||
|
||||
The default encoding aliases defined in Scrapy. Don't override this setting in
|
||||
your project, override :setting:`ENCODING_ALIASES` instead.
|
||||
|
||||
The reason why `ISO-8859-1`_ (and all its aliases) are mapped to `CP1252`_ is
|
||||
due to a well known browser hack. For more information see: `Character
|
||||
encodings in HTML`_.
|
||||
|
||||
.. _ISO-8859-1: http://en.wikipedia.org/wiki/ISO/IEC_8859-1
|
||||
.. _CP1252: http://en.wikipedia.org/wiki/Windows-1252
|
||||
.. _Character encodings in HTML: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
|
||||
|
||||
.. setting:: EXTENSIONS
|
||||
|
||||
EXTENSIONS
|
||||
@ -517,7 +579,16 @@ LOG_ENABLED
|
||||
|
||||
Default: ``True``
|
||||
|
||||
Enable logging.
|
||||
Whether to enable logging.
|
||||
|
||||
.. setting:: LOG_ENCODING
|
||||
|
||||
LOG_ENCODING
|
||||
------------
|
||||
|
||||
Default: ``'utf-8'``
|
||||
|
||||
The encoding to use for logging.
|
||||
|
||||
.. setting:: LOG_FILE
|
||||
|
||||
@ -677,6 +748,27 @@ Example::
|
||||
|
||||
NEWSPIDER_MODULE = 'mybot.spiders_dev'
|
||||
|
||||
.. setting:: RANDOMIZE_DOWNLOAD_DELAY
|
||||
|
||||
RANDOMIZE_DOWNLOAD_DELAY
|
||||
------------------------
|
||||
|
||||
Default: ``True``
|
||||
|
||||
If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
|
||||
* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
|
||||
spider.
|
||||
|
||||
This randomization decreases the chance of the crawler being detected (and
|
||||
subsequently blocked) by sites which analyze requests looking for statistically
|
||||
significant similarities in the time between their requests.
|
||||
|
||||
The randomization policy is the same used by `wget`_ ``--random-wait`` option.
|
||||
|
||||
If :setting:`DOWNLOAD_DELAY` is zero (default) this option has no effect.
|
||||
|
||||
.. _wget: http://www.gnu.org/software/wget/manual/wget.html
|
||||
|
||||
.. setting:: REDIRECT_MAX_TIMES
|
||||
|
||||
REDIRECT_MAX_TIMES
|
||||
@ -773,7 +865,7 @@ The scheduler to use for crawling.
|
||||
SCHEDULER_ORDER
|
||||
---------------
|
||||
|
||||
Default: ``'BFO'``
|
||||
Default: ``'DFO'``
|
||||
|
||||
Scope: ``scrapy.core.scheduler``
|
||||
|
||||
@ -858,6 +950,16 @@ Example::
|
||||
|
||||
SPIDER_MODULES = ['mybot.spiders_prod', 'mybot.spiders_dev']
|
||||
|
||||
.. setting:: SPIDER_SCHEDULER
|
||||
|
||||
SPIDER_SCHEDULER
|
||||
----------------
|
||||
|
||||
Default: ``'scrapy.contrib.spiderscheduler.FifoSpiderScheduler'``
|
||||
|
||||
The Spider Scheduler to use. The spider scheduler returns the next spider to
|
||||
scrape.
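
Overriding it is a one-line change in the project settings; the class path
below is hypothetical::

    SPIDER_SCHEDULER = 'myproject.schedulers.PrioritySpiderScheduler'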
|
||||
|
||||
.. setting:: STATS_CLASS
|
||||
|
||||
STATS_CLASS
|
||||
|
@ -163,7 +163,7 @@ This can be achieved by using the ``scrapy.shell.inspect_response`` function.
|
||||
Here's an example of how you would call it from your spider::
|
||||
|
||||
class MySpider(BaseSpider):
|
||||
domain_name = 'example.com'
|
||||
...
|
||||
|
||||
def parse(self, response):
|
||||
if response.url == 'http://www.example.com/products.php':
|
||||
|
@ -210,11 +210,8 @@ OffsiteMiddleware
|
||||
|
||||
Filters out Requests for URLs outside the domains covered by the spider.
|
||||
|
||||
This middleware filters out every request whose host names don't match
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider
|
||||
:attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".
|
||||
Spider can add more domains to exclude using
|
||||
:attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute.
|
||||
This middleware filters out every request whose host names aren't in the
|
||||
spider's :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute.
|
||||
|
||||
When your spider returns a request for a domain not belonging to those
|
||||
covered by the spider, this middleware will log a debug message similar to
|
||||
|
@ -70,20 +70,22 @@ BaseSpider
|
||||
requests the given ``start_urls``/``start_requests``, and calls the spider's
|
||||
method ``parse`` for each of the resulting responses.
|
||||
|
||||
.. attribute:: domain_name
|
||||
.. attribute:: name
|
||||
|
||||
A string which defines the domain name for this spider, which will also be
|
||||
the unique identifier for this spider (which means you can't have two
|
||||
spider with the same ``domain_name``). This is the most important spider
|
||||
attribute and it's required, and it's the name by which Scrapy will know
|
||||
the spider.
|
||||
A string which defines the name for this spider. The spider name is how
|
||||
the spider is located (and instantiated) by Scrapy, so it must be
|
||||
unique. However, nothing prevents you from instantiating more than one
|
||||
instance of the same spider. This is the most important spider attribute
|
||||
and it's required.
|
||||
|
||||
.. attribute:: extra_domain_names
|
||||
It is recommended to name your spiders after the domain that they crawl.
|
||||
|
||||
An optional list of strings containing additional domains that this
|
||||
spider is allowed to crawl. Requests for URLs not belonging to the
|
||||
domain name specified in :attr:`domain_name` or this list won't be
|
||||
followed.
|
||||
.. attribute:: allowed_domains
|
||||
|
||||
An optional list of strings containing domains that this spider is
|
||||
allowed to crawl. Requests for URLs not belonging to the domain names
|
||||
specified in this list won't be followed if
|
||||
:class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled.
|
||||
|
||||
.. attribute:: start_urls
|
||||
|
||||
@ -144,7 +146,7 @@ BaseSpider
|
||||
.. method:: log(message, [level, component])
|
||||
|
||||
Log a message using the :func:`scrapy.log.msg` function, automatically
|
||||
populating the domain argument with the :attr:`domain_name` of this
|
||||
populating the spider argument with the :attr:`name` of this
|
||||
spider. For more information see :ref:`topics-logging`.
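
A one-line sketch of how a spider callback might use it::

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)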
|
||||
|
||||
|
||||
@ -157,7 +159,8 @@ Let's see an example::
|
||||
from scrapy.spider import BaseSpider
|
||||
|
||||
class MySpider(BaseSpider):
|
||||
domain_name = 'http://www.example.com'
|
||||
name = 'example.com'
|
||||
allowed_domains = ['example.com']
|
||||
start_urls = [
|
||||
'http://www.example.com/1.html',
|
||||
'http://www.example.com/2.html',
|
||||
@ -177,7 +180,8 @@ Another example returning multiples Requests and Items from a single callback::
|
||||
from myproject.items import MyItem
|
||||
|
||||
class MySpider(BaseSpider):
|
||||
domain_name = 'http://www.example.com'
|
||||
name = 'example.com'
|
||||
allowed_domains = ['example.com']
|
||||
start_urls = [
|
||||
'http://www.example.com/1.html',
|
||||
'http://www.example.com/2.html',
|
||||
@ -254,7 +258,8 @@ Let's now take a look at an example CrawlSpider with rules::
|
||||
from scrapy.item import Item
|
||||
|
||||
class MySpider(CrawlSpider):
|
||||
domain_name = 'example.com'
|
||||
name = 'example.com'
|
||||
allowed_domains = ['example.com']
|
||||
start_urls = ['http://www.example.com']
|
||||
|
||||
rules = (
|
||||
@ -378,7 +383,8 @@ These spiders are pretty easy to use, let's have at one example::
|
||||
from myproject.items import TestItem
|
||||
|
||||
class MySpider(XMLFeedSpider):
|
||||
domain_name = 'example.com'
|
||||
name = 'example.com'
|
||||
allowed_domains = ['example.com']
|
||||
start_urls = ['http://www.example.com/feed.xml']
|
||||
iterator = 'iternodes' # This is actually unnecessary, since it's the default value
|
||||
itertag = 'item'
|
||||
@ -435,7 +441,8 @@ Let's see an example similar to the previous one, but using a
|
||||
from myproject.items import TestItem
|
||||
|
||||
class MySpider(CSVFeedSpider):
|
||||
domain_name = 'example.com'
|
||||
name = 'example.com'
|
||||
allowed_domains = ['example.com']
|
||||
start_urls = ['http://www.example.com/feed.csv']
|
||||
delimiter = ';'
|
||||
headers = ['id', 'name', 'description']
|
||||
|
@ -204,15 +204,15 @@ MemoryStatsCollector
|
||||
|
||||
A simple stats collector that keeps the stats of the last scraping run (for
|
||||
each spider) in memory, after they're closed. The stats can be accessed
|
||||
through the :attr:`domain_stats` attribute, which is a dict keyed by spider
|
||||
through the :attr:`spider_stats` attribute, which is a dict keyed by spider
|
||||
domain name.
|
||||
|
||||
This is the default Stats Collector used in Scrapy.
|
||||
|
||||
.. attribute:: domain_stats
|
||||
.. attribute:: spider_stats
|
||||
|
||||
A dict of dicts (keyed by spider domain name) containing the stats of
|
||||
the last scraping run for each domain.
|
||||
A dict of dicts (keyed by spider name) containing the stats of the last
|
||||
scraping run for each spider.
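
As a rough sketch (assuming the stats object is exposed as
``scrapy.stats.stats``, which is an assumption here), the collected values
could be inspected like this after a run::

    from scrapy.stats import stats   # assumption: module-level stats collector singleton

    # spider_stats is a dict of dicts keyed by spider name
    for spider_name, run_stats in stats.spider_stats.items():
        print spider_name, run_stats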
|
||||
|
||||
DummyStatsCollector
|
||||
-------------------
|
||||
@ -240,11 +240,11 @@ SimpledbStatsCollector
|
||||
In addition to the existing stats keys the following keys are added at
|
||||
persistence time:
|
||||
|
||||
* ``domain``: the spider domain (so you can use it later for querying stats
|
||||
for that domain)
|
||||
* ``spider``: the spider name (so you can use it later for querying stats
|
||||
for that spider)
|
||||
* ``timestamp``: the timestamp when the stats were persisted
|
||||
|
||||
Both the ``domain`` and ``timestamp`` are used for generating the SimpleDB
|
||||
Both the ``spider`` and ``timestamp`` are used for generating the SimpleDB
|
||||
item name in order to avoid overwriting stats of previous scraping runs.
|
||||
|
||||
As `required by SimpleDB`_, datetime's are stored in ISO 8601 format and
|
||||
|
examples/experimental/googledir/googledir/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
|
||||
# googledir project
|
examples/experimental/googledir/googledir/items.py (new file, 16 lines)
@@ -0,0 +1,16 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# http://doc.scrapy.org/topics/items.html
|
||||
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class GoogledirItem(Item):
|
||||
|
||||
name = Field(default='')
|
||||
url = Field(default='')
|
||||
description = Field(default='')
|
||||
|
||||
def __str__(self):
|
||||
return "Google Category: name=%s url=%s" \
|
||||
% (self['name'], self['url'])
|
examples/experimental/googledir/googledir/pipelines.py (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/topics/item-pipeline.html
|
||||
|
||||
from scrapy.core.exceptions import DropItem
|
||||
|
||||
class FilterWordsPipeline(object):
|
||||
"""
|
||||
A pipeline for filtering out items which contain certain
|
||||
words in their description
|
||||
"""
|
||||
|
||||
# put all words in lowercase
|
||||
words_to_filter = ['politics', 'religion']
|
||||
|
||||
def process_item(self, spider, item):
|
||||
for word in self.words_to_filter:
|
||||
if word in unicode(item['description']).lower():
|
||||
raise DropItem("Contains forbidden word: %s" % word)
|
||||
else:
|
||||
return item
|
examples/experimental/googledir/googledir/settings.py (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
# Scrapy settings for googledir project
|
||||
#
|
||||
# For simplicity, this file contains only the most important settings by
|
||||
# default. All the other settings are documented here:
|
||||
#
|
||||
# http://doc.scrapy.org/topics/settings.html
|
||||
#
|
||||
# Or you can copy and paste them from where they're defined in Scrapy:
|
||||
#
|
||||
# scrapy/conf/default_settings.py
|
||||
#
|
||||
|
||||
BOT_NAME = 'googledir'
|
||||
BOT_VERSION = '1.0'
|
||||
|
||||
SPIDER_MODULES = ['googledir.spiders']
|
||||
NEWSPIDER_MODULE = 'googledir.spiders'
|
||||
DEFAULT_ITEM_CLASS = 'googledir.items.GoogledirItem'
|
||||
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
|
||||
|
||||
ITEM_PIPELINES = ['googledir.pipelines.FilterWordsPipeline']
|
@ -0,0 +1,8 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# To create the first spider for your project use this command:
|
||||
#
|
||||
# scrapy-ctl.py genspider myspider myspider-domain.com
|
||||
#
|
||||
# For more info see:
|
||||
# http://doc.scrapy.org/topics/spiders.html
|
@ -0,0 +1,41 @@
|
||||
from scrapy.selector import HtmlXPathSelector
|
||||
from scrapy.contrib.loader import XPathItemLoader
|
||||
from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
|
||||
|
||||
from googledir.items import GoogledirItem
|
||||
|
||||
class GoogleDirectorySpider(CrawlSpider):
|
||||
|
||||
name = 'google_directory'
|
||||
allowed_domains = ['directory.google.com']
|
||||
start_urls = ['http://directory.google.com/']
|
||||
|
||||
rules = (
|
||||
# search for categories pattern and follow links
|
||||
Rule(r'/[A-Z][a-zA-Z_/]+$', 'parse_category', follow=True),
|
||||
)
|
||||
|
||||
def parse_category(self, response):
|
||||
# The main selector we're using to extract data from the page
|
||||
main_selector = HtmlXPathSelector(response)
|
||||
|
||||
# The XPath to website links in the directory page
|
||||
xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'
|
||||
|
||||
# Get a list of (sub) selectors to each website node pointed by the XPath
|
||||
sub_selectors = main_selector.select(xpath)
|
||||
|
||||
# Iterate over the sub-selectors to extract data for each website
|
||||
for selector in sub_selectors:
|
||||
item = GoogledirItem()
|
||||
|
||||
l = XPathItemLoader(item=item, selector=selector)
|
||||
l.add_xpath('name', 'a/text()')
|
||||
l.add_xpath('url', 'a/@href')
|
||||
l.add_xpath('description', 'font[2]/text()')
|
||||
|
||||
# Here we populate the item and yield it
|
||||
yield l.load_item()
|
||||
|
||||
SPIDER = GoogleDirectorySpider()
|
||||
|
examples/experimental/googledir/scrapy-ctl.py (new file, 7 lines)
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'googledir.settings')
|
||||
|
||||
from scrapy.command.cmdline import execute
|
||||
execute()
|
examples/experimental/imdb/imdb/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
|
||||
# package
|
examples/experimental/imdb/imdb/items.py (new file, 12 lines)
@@ -0,0 +1,12 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# http://doc.scrapy.org/topics/items.html
|
||||
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class ImdbItem(Item):
|
||||
# define the fields for your item here like:
|
||||
# name = Field()
|
||||
title = Field()
|
||||
url = Field()
|
examples/experimental/imdb/imdb/pipelines.py (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/topics/item-pipeline.html
|
||||
|
||||
class ImdbPipeline(object):
|
||||
def process_item(self, spider, item):
|
||||
return item
|
examples/experimental/imdb/imdb/settings.py (new file, 20 lines)
@@ -0,0 +1,20 @@
|
||||
# Scrapy settings for imdb project
|
||||
#
|
||||
# For simplicity, this file contains only the most important settings by
|
||||
# default. All the other settings are documented here:
|
||||
#
|
||||
# http://doc.scrapy.org/topics/settings.html
|
||||
#
|
||||
# Or you can copy and paste them from where they're defined in Scrapy:
|
||||
#
|
||||
# scrapy/conf/default_settings.py
|
||||
#
|
||||
|
||||
BOT_NAME = 'imdb'
|
||||
BOT_VERSION = '1.0'
|
||||
|
||||
SPIDER_MODULES = ['imdb.spiders']
|
||||
NEWSPIDER_MODULE = 'imdb.spiders'
|
||||
DEFAULT_ITEM_CLASS = 'imdb.items.ImdbItem'
|
||||
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
|
||||
|
examples/experimental/imdb/imdb/spiders/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# To create the first spider for your project use this command:
|
||||
#
|
||||
# scrapy-ctl.py genspider myspider myspider-domain.com
|
||||
#
|
||||
# For more info see:
|
||||
# http://doc.scrapy.org/topics/spiders.html
|
examples/experimental/imdb/imdb/spiders/imdb_site.py (new file, 141 lines)
@@ -0,0 +1,141 @@
|
||||
from scrapy.http import Request
|
||||
from scrapy.selector import HtmlXPathSelector
|
||||
from scrapy.contrib.loader import XPathItemLoader
|
||||
from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
|
||||
from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
|
||||
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, \
|
||||
FilterDupes, FilterUrl
|
||||
from scrapy.utils.url import urljoin_rfc
|
||||
|
||||
from imdb.items import ImdbItem, Field
|
||||
|
||||
from itertools import chain, imap, izip
|
||||
|
||||
class UsaOpeningWeekMovie(ImdbItem):
|
||||
pass
|
||||
|
||||
class UsaTopWeekMovie(ImdbItem):
|
||||
pass
|
||||
|
||||
class Top250Movie(ImdbItem):
|
||||
rank = Field()
|
||||
rating = Field()
|
||||
year = Field()
|
||||
votes = Field()
|
||||
|
||||
class MovieItem(ImdbItem):
|
||||
release_date = Field()
|
||||
tagline = Field()
|
||||
|
||||
|
||||
class ImdbSiteSpider(CrawlSpider):
|
||||
name = 'imdb.com'
|
||||
allowed_domains = ['imdb.com']
|
||||
start_urls = ['http://www.imdb.com/']
|
||||
|
||||
# extract requests using these classes from urls matching the 'follow' flag
|
||||
request_extractors = [
|
||||
SgmlRequestExtractor(tags=['a'], attrs=['href']),
|
||||
]
|
||||
|
||||
# process requests using these classes from urls matching the 'follow' flag
|
||||
request_processors = [
|
||||
Canonicalize(),
|
||||
FilterDupes(),
|
||||
FilterUrl(deny=r'/tt\d+/$'), # deny movie url as we will dispatch
|
||||
# manually the movie requests
|
||||
]
|
||||
|
||||
# include domain bit for demo purposes
|
||||
rules = (
|
||||
# these two rules expects requests from start url
|
||||
Rule(r'imdb.com/nowplaying/$', 'parse_now_playing'),
|
||||
Rule(r'imdb.com/chart/top$', 'parse_top_250'),
|
||||
# this rule will parse requests manually dispatched
|
||||
Rule(r'imdb.com/title/tt\d+/$', 'parse_movie_info'),
|
||||
)
|
||||
|
||||
def parse_now_playing(self, response):
|
||||
"""Scrapes USA openings this week and top 10 in week"""
|
||||
self.log("Parsing USA Top Week")
|
||||
hxs = HtmlXPathSelector(response)
|
||||
|
||||
_urljoin = lambda url: self._urljoin(response, url)
|
||||
|
||||
#
|
||||
# openings this week
|
||||
#
|
||||
openings = hxs.select('//table[@class="movies"]//a[@class="title"]')
|
||||
boxoffice = hxs.select('//table[@class="boxoffice movies"]//a[@class="title"]')
|
||||
|
||||
opening_titles = openings.select('text()').extract()
|
||||
opening_urls = imap(_urljoin, openings.select('@href').extract())
|
||||
|
||||
box_titles = boxoffice.select('text()').extract()
|
||||
box_urls = imap(_urljoin, boxoffice.select('@href').extract())
|
||||
|
||||
# items
|
||||
opening_items = (UsaOpeningWeekMovie(title=title, url=url)
|
||||
for (title, url)
|
||||
in izip(opening_titles, opening_urls))
|
||||
|
||||
box_items = (UsaTopWeekMovie(title=title, url=url)
|
||||
for (title, url)
|
||||
in izip(box_titles, box_urls))
|
||||
|
||||
# movie requests
|
||||
requests = imap(self.make_requests_from_url,
|
||||
chain(opening_urls, box_urls))
|
||||
|
||||
return chain(opening_items, box_items, requests)
|
||||
|
||||
def parse_top_250(self, response):
|
||||
"""Scrapes movies from top 250 list"""
|
||||
self.log("Parsing Top 250")
|
||||
hxs = HtmlXPathSelector(response)
|
||||
|
||||
# scrape each row in the table
|
||||
rows = hxs.select('//div[@id="main"]/table/tr//a/ancestor::tr')
|
||||
for row in rows:
|
||||
fields = row.select('td//text()').extract()
|
||||
url, = row.select('td//a/@href').extract()
|
||||
url = self._urljoin(response, url)
|
||||
|
||||
item = Top250Movie()
|
||||
item['title'] = fields[2]
|
||||
item['url'] = url
|
||||
item['rank'] = fields[0]
|
||||
item['rating'] = fields[1]
|
||||
item['year'] = fields[3]
|
||||
item['votes'] = fields[4]
|
||||
|
||||
# scraped top250 item
|
||||
yield item
|
||||
# fetch movie
|
||||
yield self.make_requests_from_url(url)
|
||||
|
||||
def parse_movie_info(self, response):
|
||||
"""Scrapes movie information"""
|
||||
self.log("Parsing Movie Info")
|
||||
hxs = HtmlXPathSelector(response)
|
||||
selector = hxs.select('//div[@class="maindetails"]')
|
||||
|
||||
item = MovieItem()
|
||||
# set url
|
||||
item['url'] = response.url
|
||||
|
||||
# use item loader for other attributes
|
||||
l = XPathItemLoader(item=item, selector=selector)
|
||||
l.add_xpath('title', './/h1/text()')
|
||||
l.add_xpath('release_date', './/h5[text()="Release Date:"]'
|
||||
'/following-sibling::div/text()')
|
||||
l.add_xpath('tagline', './/h5[text()="Tagline:"]'
|
||||
'/following-sibling::div/text()')
|
||||
|
||||
yield l.load_item()
|
||||
|
||||
def _urljoin(self, response, url):
|
||||
"""Helper to convert relative urls to absolute"""
|
||||
return urljoin_rfc(response.url, url, response.encoding)
|
||||
|
||||
SPIDER = ImdbSiteSpider()
|
examples/experimental/imdb/scrapy-ctl.py (new file, 7 lines)
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'imdb.settings')
|
||||
|
||||
from scrapy.command.cmdline import execute
|
||||
execute()
|
@ -6,7 +6,8 @@ from googledir.items import GoogledirItem
|
||||
|
||||
class GoogleDirectorySpider(CrawlSpider):
|
||||
|
||||
domain_name = 'directory.google.com'
|
||||
name = 'directory.google.com'
|
||||
allowed_domains = ['directory.google.com']
|
||||
start_urls = ['http://directory.google.com/']
|
||||
|
||||
rules = (
|
||||
|
@ -1,51 +0,0 @@
|
||||
"""
|
||||
Simple script to follow links from a start url. The links are followed in no
|
||||
particular order.
|
||||
|
||||
Usage:
|
||||
count_and_follow_links.py <start_url> <links_to_follow>
|
||||
|
||||
Example:
|
||||
count_and_follow_links.py http://scrapy.org/ 20
|
||||
|
||||
For each page visited, this script will print the page body size and the
|
||||
number of links found.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from urlparse import urljoin
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.selector import HtmlXPathSelector
|
||||
from scrapy.http import Request, HtmlResponse
|
||||
|
||||
links_followed = 0
|
||||
|
||||
def parse(response):
|
||||
global links_followed
|
||||
links_followed += 1
|
||||
if links_followed >= links_to_follow:
|
||||
crawler.stop()
|
||||
|
||||
# ignore non-HTML responses
|
||||
if not isinstance(response, HtmlResponse):
|
||||
return
|
||||
|
||||
links = HtmlXPathSelector(response).select('//a/@href').extract()
|
||||
abslinks = [urljoin(response.url, l) for l in links]
|
||||
|
||||
print "page %2d/%d: %s" % (links_followed, links_to_follow, response.url)
|
||||
print " size : %d bytes" % len(response.body)
|
||||
print " links: %d" % len(links)
|
||||
print
|
||||
|
||||
return [Request(l, callback=parse) for l in abslinks]
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print __doc__
|
||||
sys.exit(2)
|
||||
|
||||
start_url, links_to_follow = sys.argv[1], int(sys.argv[2])
|
||||
request = Request(start_url, callback=parse)
|
||||
crawler = Crawler()
|
||||
crawler.crawl(request)
|
@ -1,72 +0,0 @@
|
||||
DROP TABLE IF EXISTS `url_history`;
|
||||
DROP TABLE IF EXISTS `version`;
|
||||
DROP TABLE IF EXISTS `url_status`;
|
||||
DROP TABLE IF EXISTS `ticket`;
|
||||
DROP TABLE IF EXISTS `domain_stats`;
|
||||
DROP TABLE IF EXISTS `domain_stats_history`;
|
||||
DROP TABLE IF EXISTS `domain_data_history`;
|
||||
|
||||
CREATE TABLE `ticket` (
|
||||
`guid` char(40) NOT NULL,
|
||||
`domain` varchar(255) default NULL,
|
||||
`url` varchar(2048) default NULL,
|
||||
`url_hash` char(40) default NULL, -- so we can join to url_status
|
||||
PRIMARY KEY (`guid`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `version` (
|
||||
`id` bigint(20) NOT NULL auto_increment,
|
||||
`guid` char(40) NOT NULL,
|
||||
`version` char(40) NOT NULL,
|
||||
`seen` datetime NOT NULL,
|
||||
PRIMARY KEY (`id`),
|
||||
FOREIGN KEY (`guid`) REFERENCES ticket(guid) ON UPDATE CASCADE ON DELETE CASCADE,
|
||||
UNIQUE KEY (`version`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `url_status` (
|
||||
-- see http://support.microsoft.com/kb/q208427/ for explanation of 2048
|
||||
`url_hash` char(40) NOT NULL, -- for faster searches
|
||||
`url` varchar(2048) NOT NULL,
|
||||
`parent_hash` char(40) default NULL, -- the url that was followed to this one - for reporting
|
||||
`last_version` char(40) default NULL, -- can be null if it generated an error the last time is was checked
|
||||
`last_checked` datetime NOT NULL,
|
||||
PRIMARY KEY (`url_hash`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `url_history` (
|
||||
`url_hash` char(40) NOT NULL,
|
||||
`version` char(40) NOT NULL,
|
||||
`postdata_hash` char(40) default NULL,
|
||||
`created` datetime NOT NULL,
|
||||
PRIMARY KEY (`version`),
|
||||
FOREIGN KEY (`url_hash`) REFERENCES url_status(url_hash) ON UPDATE CASCADE ON DELETE CASCADE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `domain_stats` (
|
||||
`key1` varchar(128) NOT NULL,
|
||||
`key2` varchar(128) NOT NULL,
|
||||
`value` text,
|
||||
PRIMARY KEY `key1_key2` (`key1`, `key2`),
|
||||
KEY `key1` (`key1`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `domain_stats_history` (
|
||||
`id` bigint(20) NOT NULL auto_increment,
|
||||
`key1` varchar(128) NOT NULL,
|
||||
`key2` varchar(128) NOT NULL,
|
||||
`value` varchar(2048) NOT NULL,
|
||||
`stored` datetime NOT NULL,
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `key1_key2` (`key1`, `key2`),
|
||||
KEY `key1` (`key1`),
|
||||
KEY `stored` (`stored`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `domain_data_history` (
|
||||
`domain` varchar(255) NOT NULL,
|
||||
`stored` datetime NOT NULL,
|
||||
`data` text,
|
||||
KEY `domain_stored` (`domain`, `stored`),
|
||||
KEY `domain` (`domain`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
@ -2,8 +2,8 @@
|
||||
Scrapy - a screen scraping framework written in Python
|
||||
"""
|
||||
|
||||
version_info = (0, 8, 0, '', 0)
|
||||
__version__ = "0.8"
|
||||
version_info = (0, 9, 0, 'dev')
|
||||
__version__ = "0.9-dev"
|
||||
|
||||
import sys, os, warnings
|
||||
|
||||
@ -17,11 +17,6 @@ warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
|
||||
# monkey patches to fix external library issues
|
||||
from scrapy.xlib import twisted_250_monkeypatches
|
||||
|
||||
# add some common encoding aliases not included by default in Python
|
||||
from scrapy.utils.encoding import add_encoding_alias
|
||||
add_encoding_alias('gb2312', 'zh-cn')
|
||||
add_encoding_alias('cp1251', 'win-1251')
|
||||
|
||||
# optional_features is a set containing Scrapy optional features
|
||||
optional_features = set()
|
||||
|
||||
|
@ -7,20 +7,14 @@ import cProfile
|
||||
|
||||
import scrapy
|
||||
from scrapy import log
|
||||
from scrapy.spider import spiders
|
||||
from scrapy.xlib import lsprofcalltree
|
||||
from scrapy.conf import settings
|
||||
from scrapy.command.models import ScrapyCommand
|
||||
from scrapy.utils.signal import send_catch_log
|
||||
|
||||
# This dict holds information about the executed command for later use
|
||||
command_executed = {}
|
||||
|
||||
def _save_command_executed(cmdname, cmd, args, opts):
|
||||
"""Save command executed info for later reference"""
|
||||
command_executed['name'] = cmdname
|
||||
command_executed['class'] = cmd
|
||||
command_executed['args'] = args[:]
|
||||
command_executed['opts'] = opts.__dict__.copy()
|
||||
# Signal that carries information about the command which was executed
|
||||
# args: cmdname, cmdobj, args, opts
|
||||
command_executed = object()
|
||||
|
||||
def _find_commands(dir):
|
||||
try:
|
||||
@ -127,7 +121,8 @@ def execute(argv=None):
|
||||
sys.exit(2)
|
||||
|
||||
del args[0] # remove command name from args
|
||||
_save_command_executed(cmdname, cmd, args, opts)
|
||||
send_catch_log(signal=command_executed, cmdname=cmdname, cmdobj=cmd, \
|
||||
args=args, opts=opts)
|
||||
from scrapy.core.manager import scrapymanager
|
||||
scrapymanager.configure(control_reactor=True)
|
||||
ret = _run_command(cmd, args, opts)
|
||||
@ -136,23 +131,25 @@ def execute(argv=None):
|
||||
|
||||
def _run_command(cmd, args, opts):
|
||||
if opts.profile or opts.lsprof:
|
||||
if opts.profile:
|
||||
log.msg("writing cProfile stats to %r" % opts.profile)
|
||||
if opts.lsprof:
|
||||
log.msg("writing lsprof stats to %r" % opts.lsprof)
|
||||
loc = locals()
|
||||
p = cProfile.Profile()
|
||||
p.runctx('ret = cmd.run(args, opts)', globals(), loc)
|
||||
if opts.profile:
|
||||
p.dump_stats(opts.profile)
|
||||
k = lsprofcalltree.KCacheGrind(p)
|
||||
if opts.lsprof:
|
||||
with open(opts.lsprof, 'w') as f:
|
||||
k.output(f)
|
||||
ret = loc['ret']
|
||||
return _run_command_profiled(cmd, args, opts)
|
||||
else:
|
||||
ret = cmd.run(args, opts)
|
||||
return ret
|
||||
return cmd.run(args, opts)
|
||||
|
||||
def _run_command_profiled(cmd, args, opts):
|
||||
if opts.profile:
|
||||
log.msg("writing cProfile stats to %r" % opts.profile)
|
||||
if opts.lsprof:
|
||||
log.msg("writing lsprof stats to %r" % opts.lsprof)
|
||||
loc = locals()
|
||||
p = cProfile.Profile()
|
||||
p.runctx('ret = cmd.run(args, opts)', globals(), loc)
|
||||
if opts.profile:
|
||||
p.dump_stats(opts.profile)
|
||||
k = lsprofcalltree.KCacheGrind(p)
|
||||
if opts.lsprof:
|
||||
with open(opts.lsprof, 'w') as f:
|
||||
k.output(f)
|
||||
return loc['ret']
|
||||
|
||||
if __name__ == '__main__':
|
||||
execute()
|
||||
|
@ -1,20 +1,27 @@
|
||||
from scrapy import log
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.core.manager import scrapymanager
|
||||
from scrapy.conf import settings
|
||||
from scrapy.http import Request
|
||||
from scrapy.spider import spiders
|
||||
from scrapy.utils.url import is_url
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = True
|
||||
|
||||
def syntax(self):
|
||||
return "[options] <domain|url> ..."
|
||||
return "[options] <spider|url> ..."
|
||||
|
||||
def short_desc(self):
|
||||
return "Start crawling a domain or URL"
|
||||
return "Start crawling from a spider or URL"
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("--spider", dest="spider", default=None, \
|
||||
help="always use this spider when arguments are urls")
|
||||
parser.add_option("-n", "--nofollow", dest="nofollow", action="store_true", \
|
||||
help="don't follow links (for use with URLs only)")
|
||||
|
||||
@ -24,4 +31,45 @@ class Command(ScrapyCommand):
|
||||
settings.overrides['CRAWLSPIDER_FOLLOW_LINKS'] = False
|
||||
|
||||
def run(self, args, opts):
|
||||
scrapymanager.runonce(*args)
|
||||
urls, names = self._split_urls_and_names(args)
|
||||
for name in names:
|
||||
scrapymanager.crawl_spider_name(name)
|
||||
|
||||
if opts.spider:
|
||||
try:
|
||||
spider = spiders.create(opts.spider)
|
||||
for url in urls:
|
||||
scrapymanager.crawl_url(url, spider)
|
||||
except KeyError:
|
||||
log.msg('Could not find spider: %s' % opts.spider, log.ERROR)
|
||||
else:
|
||||
for name, urls in self._group_urls_by_spider(urls):
|
||||
spider = spiders.create(name)
|
||||
for url in urls:
|
||||
scrapymanager.crawl_url(url, spider)
|
||||
|
||||
scrapymanager.start()
|
||||
|
||||
def _group_urls_by_spider(self, urls):
|
||||
spider_urls = defaultdict(list)
|
||||
for url in urls:
|
||||
spider_names = spiders.find_by_request(Request(url))
|
||||
if not spider_names:
|
||||
log.msg('Could not find spider for url: %s' % url,
|
||||
log.ERROR)
|
||||
elif len(spider_names) > 1:
|
||||
log.msg('More than one spider found for url: %s' % url,
|
||||
log.ERROR)
|
||||
else:
|
||||
spider_urls[spider_names[0]].append(url)
|
||||
return spider_urls.items()
|
||||
|
||||
def _split_urls_and_names(self, args):
|
||||
urls = []
|
||||
names = []
|
||||
for arg in args:
|
||||
if is_url(arg):
|
||||
urls.append(arg)
|
||||
else:
|
||||
names.append(arg)
|
||||
return urls, names
|
||||
|
@ -1,7 +1,11 @@
|
||||
import pprint
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.utils.fetch import fetch
|
||||
from scrapy.core.manager import scrapymanager
|
||||
from scrapy.http import Request
|
||||
from scrapy.spider import BaseSpider, spiders
|
||||
from scrapy.utils.url import is_url
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
@ -19,17 +23,33 @@ class Command(ScrapyCommand):
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("--spider", dest="spider",
|
||||
help="use this spider")
|
||||
parser.add_option("--headers", dest="headers", action="store_true", \
|
||||
help="print response HTTP headers instead of body")
|
||||
|
||||
def run(self, args, opts):
|
||||
if len(args) != 1:
|
||||
print "One URL is required"
|
||||
return
|
||||
if len(args) != 1 or not is_url(args[0]):
|
||||
return False
|
||||
responses = [] # to collect downloaded responses
|
||||
request = Request(args[0], callback=responses.append, dont_filter=True)
|
||||
|
||||
responses = fetch(args)
|
||||
if opts.spider:
|
||||
try:
|
||||
spider = spiders.create(opts.spider)
|
||||
except KeyError:
|
||||
log.msg("Could not find spider: %s" % opts.spider, log.ERROR)
|
||||
else:
|
||||
spider = scrapymanager._create_spider_for_request(request, \
|
||||
BaseSpider('default'))
|
||||
|
||||
scrapymanager.crawl_request(request, spider)
|
||||
scrapymanager.start()
|
||||
|
||||
# display response
|
||||
if responses:
|
||||
if opts.headers:
|
||||
pprint.pprint(responses[0].headers)
|
||||
else:
|
||||
print responses[0].body
|
||||
|
||||
|
@ -15,10 +15,11 @@ SPIDER_TEMPLATES_PATH = join(scrapy.__path__[0], 'templates', 'spiders')
|
||||
|
||||
|
||||
def sanitize_module_name(module_name):
|
||||
"""Sanitize the given module name, by replacing dashes with underscores and
|
||||
prefixing it with a letter if it doesn't start with one
|
||||
"""Sanitize the given module name, by replacing dashes and points
|
||||
with underscores and prefixing it with a letter if it doesn't start
|
||||
with one
|
||||
"""
|
||||
module_name = module_name.replace('-', '_')
|
||||
module_name = module_name.replace('-', '_').replace('.', '_')
|
||||
if module_name[0] not in string.ascii_letters:
|
||||
module_name = "a" + module_name
|
||||
return module_name
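# Illustrative trace (not part of the original source):
#   sanitize_module_name("2fast-to.crawl") -> "a2fast_to_crawl"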
|
||||
@ -28,7 +29,7 @@ class Command(ScrapyCommand):
|
||||
requires_project = True
|
||||
|
||||
def syntax(self):
|
||||
return "[options] <spider_module_name> <spider_domain_name>"
|
||||
return "[options] <name> <domain>"
|
||||
|
||||
def short_desc(self):
|
||||
return "Generate new spider based on template passed with -t or --template"
|
||||
@ -54,28 +55,37 @@ class Command(ScrapyCommand):
|
||||
print template.read()
|
||||
return
|
||||
|
||||
if len(args) < 2:
|
||||
if len(args) != 2:
|
||||
return False
|
||||
|
||||
module = sanitize_module_name(args[0])
|
||||
name = args[0]
|
||||
domain = args[1]
|
||||
spider = spiders.fromdomain(domain)
|
||||
if spider and not opts.force:
|
||||
print "Spider '%s' already exists in module:" % domain
|
||||
print " %s" % spider.__module__
|
||||
sys.exit(1)
|
||||
|
||||
module = sanitize_module_name(name)
|
||||
|
||||
# if spider already exists and not force option then halt
|
||||
try:
|
||||
spider = spiders.create(name)
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
if not opts.force:
|
||||
print "Spider '%s' already exists in module:" % name
|
||||
print " %s" % spider.__module__
|
||||
sys.exit(1)
|
||||
|
||||
template_file = self._find_template(opts.template)
|
||||
if template_file:
|
||||
self._genspider(module, domain, opts.template, template_file)
|
||||
self._genspider(module, name, domain, opts.template, template_file)
|
||||
|
||||
def _genspider(self, module, domain, template_name, template_file):
|
||||
def _genspider(self, module, name, domain, template_name, template_file):
|
||||
"""Generate the spider module, based on the given template"""
|
||||
tvars = {
|
||||
'project_name': settings.get('BOT_NAME'),
|
||||
'ProjectName': string_camelcase(settings.get('BOT_NAME')),
|
||||
'module': module,
|
||||
'site': domain,
|
||||
'name': name,
|
||||
'domain': domain,
|
||||
'classname': '%sSpider' % ''.join([s.capitalize() \
|
||||
for s in module.split('_')])
|
||||
}
|
||||
@ -86,7 +96,7 @@ class Command(ScrapyCommand):
|
||||
|
||||
shutil.copyfile(template_file, spider_file)
|
||||
render_templatefile(spider_file, **tvars)
|
||||
print "Created spider %r using template %r in module:" % (domain, \
|
||||
print "Created spider %r using template %r in module:" % (name, \
|
||||
template_name)
|
||||
print " %s.%s" % (spiders_module.__name__, module)
|
||||
|
||||
|
@ -1,11 +1,15 @@
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.utils.fetch import fetch
|
||||
from scrapy.core.manager import scrapymanager
|
||||
from scrapy.http import Request
|
||||
from scrapy.item import BaseItem
|
||||
from scrapy.spider import spiders
|
||||
from scrapy.utils import display
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.url import is_url
|
||||
from scrapy import log
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
requires_project = True
|
||||
@ -18,6 +22,8 @@ class Command(ScrapyCommand):
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("--spider", dest="spider", default=None, \
|
||||
help="always use this spider")
|
||||
parser.add_option("--nolinks", dest="nolinks", action="store_true", \
|
||||
help="don't show extracted links")
|
||||
parser.add_option("--noitems", dest="noitems", action="store_true", \
|
||||
@ -37,18 +43,13 @@ class Command(ScrapyCommand):
|
||||
return item
|
||||
|
||||
def run_callback(self, spider, response, callback, args, opts):
|
||||
spider = spiders.fromurl(response.url)
|
||||
if not spider:
|
||||
log.msg('Cannot find spider for url: %s' % response.url, level=log.ERROR)
|
||||
return (), ()
|
||||
|
||||
if callback:
|
||||
callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
|
||||
if not callback_fcn:
|
||||
log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name))
|
||||
log.msg('Cannot find callback %s in %s spider' % (callback, spider.name))
|
||||
return (), ()
|
||||
|
||||
result = callback_fcn(response)
|
||||
result = iterate_spider_output(callback_fcn(response))
|
||||
links = [i for i in result if isinstance(i, Request)]
|
||||
items = [self.pipeline_process(i, spider, opts) for i in result if \
|
||||
isinstance(i, BaseItem)]
|
||||
@ -71,36 +72,68 @@ class Command(ScrapyCommand):
|
||||
display.pprint(list(links))
|
||||
|
||||
def run(self, args, opts):
|
||||
if not args:
|
||||
print "An URL is required"
|
||||
if not len(args) == 1 or not is_url(args[0]):
|
||||
return False
|
||||
|
||||
request = Request(args[0])
|
||||
|
||||
if opts.spider:
|
||||
try:
|
||||
spider = spiders.create(opts.spider)
|
||||
except KeyError:
|
||||
log.msg('Could not find spider: %s' % opts.spider, log.ERROR)
|
||||
return
|
||||
else:
|
||||
spider = scrapymanager._create_spider_for_request(request, \
|
||||
log_none=True, log_multiple=True)
|
||||
|
||||
if not spider:
|
||||
return
|
||||
|
||||
for response in fetch(args):
|
||||
spider = spiders.fromurl(response.url)
|
||||
if not spider:
|
||||
log.msg('Cannot find spider for "%s"' % response.url)
|
||||
continue
|
||||
responses = [] # to collect downloaded responses
|
||||
request = request.replace(callback=responses.append)
|
||||
|
||||
if self.callbacks:
|
||||
for callback in self.callbacks:
|
||||
items, links = self.run_callback(spider, response, callback, args, opts)
|
||||
self.print_results(items, links, callback, opts)
|
||||
scrapymanager.crawl_request(request, spider)
|
||||
scrapymanager.start()
|
||||
|
||||
elif opts.rules:
|
||||
rules = getattr(spider, 'rules', None)
|
||||
if rules:
|
||||
items, links = [], []
|
||||
for rule in rules:
|
||||
if rule.callback and rule.link_extractor.matches(response.url):
|
||||
items, links = self.run_callback(spider, response, rule.callback, args, opts)
|
||||
self.print_results(items, links, rule.callback, opts)
|
||||
break
|
||||
else:
|
||||
log.msg('No rules found for spider "%s", please specify a callback for parsing' \
|
||||
% spider.domain_name)
|
||||
continue
|
||||
if not responses:
|
||||
log.msg('No response returned', log.ERROR, spider=spider)
|
||||
return
|
||||
|
||||
# now process response
|
||||
# - if callbacks defined then call each one print results
|
||||
# - if --rules option given search for matching spider's rule
|
||||
# - default print result using default 'parse' spider's callback
|
||||
response = responses[0]
|
||||
|
||||
if self.callbacks:
|
||||
# apply each callback
|
||||
for callback in self.callbacks:
|
||||
items, links = self.run_callback(spider, response,
|
||||
callback, args, opts)
|
||||
self.print_results(items, links, callback, opts)
|
||||
elif opts.rules:
|
||||
# search for matching spider's rule
|
||||
if hasattr(spider, 'rules') and spider.rules:
|
||||
items, links = [], []
|
||||
for rule in spider.rules:
|
||||
if rule.link_extractor.matches(response.url) \
|
||||
and rule.callback:
|
||||
|
||||
items, links = self.run_callback(spider,
|
||||
response, rule.callback,
|
||||
args, opts)
|
||||
self.print_results(items, links,
|
||||
rule.callback, opts)
|
||||
# first-match rule breaks rules loop
|
||||
break
|
||||
else:
|
||||
items, links = self.run_callback(spider, response, 'parse', args, opts)
|
||||
self.print_results(items, links, 'parse', opts)
|
||||
log.msg('No rules found for spider "%s", ' \
|
||||
'please specify a callback for parsing' \
|
||||
% spider.name, log.ERROR)
|
||||
else:
|
||||
# default callback 'parse'
|
||||
items, links = self.run_callback(spider, response,
|
||||
'parse', args, opts)
|
||||
self.print_results(items, links, 'parse', opts)
|
||||
|
||||
|
@ -52,6 +52,10 @@ class Command(ScrapyCommand):
|
||||
dispatcher.connect(exporter.export_item, signal=signals.item_passed)
|
||||
exporter.start_exporting()
|
||||
module = _import_file(args[0])
|
||||
scrapymanager.runonce(module.SPIDER)
|
||||
|
||||
# schedule spider and start engine
|
||||
scrapymanager.crawl_spider(module.SPIDER)
|
||||
scrapymanager.start()
|
||||
|
||||
if opts.output:
|
||||
exporter.finish_exporting()
|
||||
|
@ -9,4 +9,4 @@ class Command(ScrapyCommand):
|
||||
return "Start the Scrapy manager but don't run any spider (idle mode)"
|
||||
|
||||
def run(self, args, opts):
|
||||
scrapymanager.start(*args)
|
||||
scrapymanager.start(keep_alive=True)
|
||||
|
@ -7,7 +7,7 @@ from os.path import join, exists
|
||||
import scrapy
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.utils.template import render_templatefile, string_camelcase
|
||||
from scrapy.utils.python import ignore_patterns, copytree
|
||||
from scrapy.utils.py26 import ignore_patterns, copytree
|
||||
|
||||
TEMPLATES_PATH = join(scrapy.__path__[0], 'templates', 'project')
|
||||
|
||||
|
@ -57,8 +57,6 @@ class ScrapyCommand(object):
|
||||
help="log level (default: %s)" % settings['LOGLEVEL'])
|
||||
group.add_option("--nolog", action="store_true", dest="nolog", \
|
||||
help="disable logging completely")
|
||||
group.add_option("--spider", dest="spider", default=None, \
|
||||
help="always use this spider when arguments are urls")
|
||||
group.add_option("--profile", dest="profile", metavar="FILE", default=None, \
|
||||
help="write python cProfile stats to FILE")
|
||||
group.add_option("--lsprof", dest="lsprof", metavar="FILE", default=None, \
|
||||
@ -99,10 +97,6 @@ class ScrapyCommand(object):
|
||||
if opts.nolog:
|
||||
settings.overrides['LOG_ENABLED'] = False
|
||||
|
||||
if opts.spider:
|
||||
from scrapy.spider import spiders
|
||||
spiders.force_domain = opts.spider
|
||||
|
||||
if opts.pidfile:
|
||||
with open(opts.pidfile, "w") as f:
|
||||
f.write(str(os.getpid()))
|
||||
|
@ -71,6 +71,40 @@ DOWNLOADER_STATS = True
|
||||
|
||||
DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
|
||||
|
||||
ENCODING_ALIASES = {}
|
||||
|
||||
ENCODING_ALIASES_BASE = {
|
||||
# gb2312 is superseded by gb18030
|
||||
'gb2312': 'gb18030',
|
||||
'chinese': 'gb18030',
|
||||
'csiso58gb231280': 'gb18030',
|
||||
'euc-cn': 'gb18030',
|
||||
'euccn': 'gb18030',
|
||||
'eucgb2312-cn': 'gb18030',
|
||||
'gb2312-1980': 'gb18030',
|
||||
'gb2312-80': 'gb18030',
|
||||
'iso-ir-58': 'gb18030',
|
||||
# gbk is superseded by gb18030
|
||||
'gbk': 'gb18030',
|
||||
'936': 'gb18030',
|
||||
'cp936': 'gb18030',
|
||||
'ms936': 'gb18030',
|
||||
# latin_1 is a subset of cp1252
|
||||
'latin_1': 'cp1252',
|
||||
'iso-8859-1': 'cp1252',
|
||||
'iso8859-1': 'cp1252',
|
||||
'8859': 'cp1252',
|
||||
'cp819': 'cp1252',
|
||||
'latin': 'cp1252',
|
||||
'latin1': 'cp1252',
|
||||
'l1': 'cp1252',
|
||||
# others
|
||||
'zh-cn': 'gb18030',
|
||||
'win-1251': 'cp1251',
|
||||
'macintosh' : 'mac_roman',
|
||||
'x-sjis': 'shift_jis',
|
||||
}
|
||||
|
||||
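
ENCODING_ALIASES is empty by default; presumably it is meant for per-project
additions on top of ENCODING_ALIASES_BASE, mirroring the other *_BASE settings
above. A hypothetical project-level override might look like:

    # settings.py of a project (hypothetical alias and target)
    ENCODING_ALIASES = {
        'x-my-legacy-charset': 'cp1252',  # map an unknown vendor alias to cp1252
    }
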
EXTENSIONS = {}
|
||||
|
||||
EXTENSIONS_BASE = {
|
||||
@ -101,6 +135,7 @@ ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
|
||||
ITEM_PIPELINES = []
|
||||
|
||||
LOG_ENABLED = True
|
||||
LOG_ENCODING = 'utf-8'
|
||||
LOG_FORMATTER_CRAWLED = 'scrapy.contrib.logformatter.crawled_logline'
|
||||
LOG_STDOUT = False
|
||||
LOG_LEVEL = 'DEBUG'
|
||||
@ -122,6 +157,8 @@ MYSQL_CONNECTION_SETTINGS = {}
|
||||
|
||||
NEWSPIDER_MODULE = ''
|
||||
|
||||
RANDOMIZE_DOWNLOAD_DELAY = True
|
||||
|
||||
REDIRECT_MAX_METAREFRESH_DELAY = 100
|
||||
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
|
||||
REDIRECT_PRIORITY_ADJUST = +2
|
||||
@ -150,7 +187,7 @@ SCHEDULER_MIDDLEWARES_BASE = {
|
||||
'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware': 500,
|
||||
}
|
||||
|
||||
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
|
||||
SCHEDULER_ORDER = 'DFO'
|
||||
|
||||
SPIDER_MANAGER_CLASS = 'scrapy.contrib.spidermanager.TwistedPluginSpiderManager'
|
||||
|
||||
|
@ -5,13 +5,13 @@ because Amazon Web Service use timestamps for authentication.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from time import strftime, gmtime
|
||||
from scrapy.utils.aws import sign_request
|
||||
from scrapy.conf import settings
|
||||
|
||||
|
||||
class AWSMiddleware(object):
|
||||
|
||||
def __init__(self):
|
||||
self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
|
||||
os.environ.get('AWS_ACCESS_KEY_ID')
|
||||
@ -19,9 +19,6 @@ class AWSMiddleware(object):
|
||||
os.environ.get('AWS_SECRET_ACCESS_KEY')
|
||||
|
||||
def process_request(self, request, spider):
|
||||
hostname = urlparse_cached(request).hostname
|
||||
if spider.domain_name == 's3.amazonaws.com' \
|
||||
or (hostname and hostname.endswith('s3.amazonaws.com')):
|
||||
request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \
|
||||
time.gmtime())
|
||||
if request.meta.get('sign_s3_request'):
|
||||
request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
|
||||
sign_request(request, self.access_key, self.secret_key)
|
||||
|
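
With this change the S3 signing becomes opt-in per request through the
'sign_s3_request' meta key instead of being tied to the spider's domain. A
minimal sketch of building such a request (bucket and key are illustrative):

    from scrapy.http import Request

    # the middleware adds the Date header and signs the request before download
    req = Request('http://mybucket.s3.amazonaws.com/some/key', method='PUT',
                  body='payload', meta={'sign_s3_request': True})
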
@ -108,7 +108,7 @@ class FilesystemCacheStorage(object):
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
key = request_fingerprint(request)
|
||||
return join(self.cachedir, spider.domain_name, key[0:2], key)
|
||||
return join(self.cachedir, spider.name, key[0:2], key)
|
||||
|
||||
def _read_meta(self, spider, request):
|
||||
rpath = self._get_request_path(spider, request)
|
||||
|
@ -1,4 +1,5 @@
|
||||
from scrapy import log
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.url import urljoin_rfc
|
||||
from scrapy.utils.response import get_meta_refresh
|
||||
from scrapy.core.exceptions import IgnoreRequest
|
||||
@ -24,10 +25,11 @@ class RedirectMiddleware(object):
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
interval, url = get_meta_refresh(response)
|
||||
if url and interval < self.max_metarefresh_delay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
if isinstance(response, HtmlResponse):
|
||||
interval, url = get_meta_refresh(response)
|
||||
if url and interval < self.max_metarefresh_delay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
||||
return response
|
||||
|
||||
|
@ -1,9 +1,5 @@
|
||||
from scrapy.contrib.exporter import BaseItemExporter
|
||||
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
import simplejson as json
|
||||
from scrapy.utils.py26 import json
|
||||
|
||||
class JsonLinesItemExporter(BaseItemExporter):
|
||||
|
||||
|
@ -1,26 +0,0 @@
|
||||
"""
|
||||
Extensions to override scrapy settings with per-group settings according to the
|
||||
group the spider belongs to. It only overrides the settings when running the
|
||||
crawl command with *only one domain as argument*.
|
||||
"""
|
||||
|
||||
from scrapy.conf import settings
|
||||
from scrapy.core.exceptions import NotConfigured
|
||||
from scrapy.command.cmdline import command_executed
|
||||
|
||||
class GroupSettings(object):
|
||||
|
||||
def __init__(self):
|
||||
if not settings.getbool("GROUPSETTINGS_ENABLED"):
|
||||
raise NotConfigured
|
||||
|
||||
if command_executed and command_executed['name'] == 'crawl':
|
||||
mod = __import__(settings['GROUPSETTINGS_MODULE'], {}, {}, [''])
|
||||
args = command_executed['args']
|
||||
if len(args) == 1 and not args[0].startswith('http://'):
|
||||
domain = args[0]
|
||||
settings.overrides.update(mod.default_settings)
|
||||
for group, domains in mod.group_spiders.iteritems():
|
||||
if domain in domains:
|
||||
settings.overrides.update(mod.group_settings.get(group, {}))
|
||||
|
@ -1,6 +1,6 @@
|
||||
"""
|
||||
This module provides a mechanism for collecting one (or more) sample items per
|
||||
domain.
|
||||
spider.
|
||||
|
||||
The items are collected in a dict of guid->item and persisted by pickling that
|
||||
dict into a file.
|
||||
@ -8,7 +8,7 @@ dict into a file.
|
||||
This can be useful for testing changes made to the framework or other common
|
||||
code that affects several spiders.
|
||||
|
||||
It uses the scrapy stats service to keep track of which domains are already
|
||||
It uses the scrapy stats service to keep track of which spiders are already
|
||||
sampled.
|
||||
|
||||
Settings that affect this module:
|
||||
@ -48,7 +48,7 @@ class ItemSamplerPipeline(object):
|
||||
raise NotConfigured
|
||||
self.items = {}
|
||||
self.spiders_count = 0
|
||||
self.empty_domains = set()
|
||||
self.empty_spiders = set()
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
|
||||
|
||||
@ -66,21 +66,21 @@ class ItemSamplerPipeline(object):
|
||||
def engine_stopped(self):
|
||||
with open(self.filename, 'w') as f:
|
||||
pickle.dump(self.items, f)
|
||||
if self.empty_domains:
|
||||
log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \
|
||||
if self.empty_spiders:
|
||||
log.msg("No products sampled for: %s" % " ".join(self.empty_spiders), \
|
||||
level=log.WARNING)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
if reason == 'finished' and not stats.get_value("items_sampled", spider=spider):
|
||||
self.empty_domains.add(spider.domain_name)
|
||||
self.empty_spiders.add(spider.name)
|
||||
self.spiders_count += 1
|
||||
log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \
|
||||
len(self.empty_domains)), level=log.INFO)
|
||||
log.msg("Sampled %d spiders so far (%d empty)" % (self.spiders_count, \
|
||||
len(self.empty_spiders)), level=log.INFO)
|
||||
|
||||
|
||||
class ItemSamplerMiddleware(object):
|
||||
"""This middleware drops items and requests (when domain sampling has been
|
||||
completed) to accelerate the processing of remaining domains"""
|
||||
"""This middleware drops items and requests (when spider sampling has been
|
||||
completed) to accelerate the processing of remaining spiders"""
|
||||
|
||||
def __init__(self):
|
||||
if not settings['ITEMSAMPLER_FILE']:
|
||||
|
@ -26,7 +26,7 @@ class HtmlParserLinkExtractor(HTMLParser):
|
||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
||||
|
||||
ret = []
|
||||
base_url = self.base_url if self.base_url else response_url
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
|
@ -3,7 +3,6 @@ This module implements the HtmlImageLinkExtractor for extracting
|
||||
image links only.
|
||||
"""
|
||||
|
||||
import urlparse
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.url import canonicalize_url, urljoin_rfc
|
||||
@ -25,13 +24,13 @@ class HTMLImageLinkExtractor(object):
|
||||
self.unique = unique
|
||||
self.canonicalize = canonicalize
|
||||
|
||||
def extract_from_selector(self, selector, parent=None):
|
||||
def extract_from_selector(self, selector, encoding, parent=None):
|
||||
ret = []
|
||||
def _add_link(url_sel, alt_sel=None):
|
||||
url = flatten([url_sel.extract()])
|
||||
alt = flatten([alt_sel.extract()]) if alt_sel else (u'', )
|
||||
if url:
|
||||
ret.append(Link(unicode_to_str(url[0]), alt[0]))
|
||||
ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))
|
||||
|
||||
if selector.xmlNode.type == 'element':
|
||||
if selector.xmlNode.name == 'img':
|
||||
@ -41,7 +40,7 @@ class HTMLImageLinkExtractor(object):
|
||||
children = selector.select('child::*')
|
||||
if len(children):
|
||||
for child in children:
|
||||
ret.extend(self.extract_from_selector(child, parent=selector))
|
||||
ret.extend(self.extract_from_selector(child, encoding, parent=selector))
|
||||
elif selector.xmlNode.name == 'a' and not parent:
|
||||
_add_link(selector.select('@href'), selector.select('@title'))
|
||||
else:
|
||||
@ -52,7 +51,7 @@ class HTMLImageLinkExtractor(object):
|
||||
def extract_links(self, response):
|
||||
xs = HtmlXPathSelector(response)
|
||||
base_url = xs.select('//base/@href').extract()
|
||||
base_url = unicode_to_str(base_url[0]) if base_url else unicode_to_str(response.url)
|
||||
base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url
|
||||
|
||||
links = []
|
||||
for location in self.locations:
|
||||
@ -64,7 +63,7 @@ class HTMLImageLinkExtractor(object):
|
||||
continue
|
||||
|
||||
for selector in selectors:
|
||||
links.extend(self.extract_from_selector(selector))
|
||||
links.extend(self.extract_from_selector(selector, response.encoding))
|
||||
|
||||
seen, ret = set(), []
|
||||
for link in links:
|
||||
|
@ -29,7 +29,7 @@ class LxmlLinkExtractor(object):
|
||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
||||
|
||||
ret = []
|
||||
base_url = self.base_url if self.base_url else response_url
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
|
@ -16,8 +16,9 @@ def clean_link(link_text):
|
||||
|
||||
class RegexLinkExtractor(SgmlLinkExtractor):
|
||||
"""High performant link extractor"""
|
||||
|
||||
def _extract_links(self, response_text, response_url, response_encoding):
|
||||
base_url = self.base_url if self.base_url else response_url
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
|
||||
clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
|
||||
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
|
||||
|
@ -28,7 +28,7 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
|
||||
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
|
||||
|
||||
ret = []
|
||||
base_url = self.base_url if self.base_url else response_url
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
for link in links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
|
@ -8,6 +8,7 @@ from scrapy.xlib.pydispatch import dispatcher
|
||||
from scrapy.core import signals
|
||||
from scrapy.core.exceptions import NotConfigured
|
||||
from scrapy.contrib import exporter
|
||||
from scrapy.contrib.exporter import jsonlines
|
||||
from scrapy.conf import settings
|
||||
|
||||
class FileExportPipeline(object):
|
||||
@ -48,7 +49,6 @@ class FileExportPipeline(object):
|
||||
elif format == 'pickle':
|
||||
exp = exporter.PickleItemExporter(file, **exp_kwargs)
|
||||
elif format == 'json':
|
||||
from scrapy.contrib.exporter import jsonlines
|
||||
exp = jsonlines.JsonLinesItemExporter(file, **exp_kwargs)
|
||||
else:
|
||||
raise NotConfigured("Unsupported export format: %s" % format)
|
||||
|
@ -47,7 +47,7 @@ class FSImagesStore(object):
|
||||
dispatcher.connect(self.spider_closed, signals.spider_closed)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.created_directories.pop(spider.domain_name, None)
|
||||
self.created_directories.pop(spider.name, None)
|
||||
|
||||
def persist_image(self, key, image, buf, info):
|
||||
absolute_path = self._get_filesystem_path(key)
|
||||
@ -92,7 +92,7 @@ class _S3AmazonAWSSpider(BaseSpider):
|
||||
It means that a spider that uses download_delay or alike is not going to be
|
||||
delayed even more because it is uploading images to s3.
|
||||
"""
|
||||
domain_name = "s3.amazonaws.com"
|
||||
name = "s3.amazonaws.com"
|
||||
start_urls = ['http://s3.amazonaws.com/']
|
||||
max_concurrent_requests = 100
|
||||
|
||||
@ -143,7 +143,7 @@ class S3ImagesStore(object):
|
||||
def _build_request(self, key, method, body=None, headers=None):
|
||||
url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
|
||||
return Request(url, method=method, body=body, headers=headers, \
|
||||
priority=self.request_priority)
|
||||
meta={'sign_s3_request': True}, priority=self.request_priority)
|
||||
|
||||
def _download_request(self, request, info):
|
||||
"""This method is used for HEAD and PUT requests sent to amazon S3
|
||||
|
@ -4,7 +4,6 @@ spiders
|
||||
"""
|
||||
|
||||
import sys
|
||||
import urlparse
|
||||
|
||||
from twisted.plugin import getCache
|
||||
from twisted.python.rebuild import rebuild
|
||||
@ -19,42 +18,38 @@ class TwistedPluginSpiderManager(object):
|
||||
|
||||
def __init__(self):
|
||||
self.loaded = False
|
||||
self.force_domain = None
|
||||
self._invaliddict = {}
|
||||
self._spiders = {}
|
||||
|
||||
def fromdomain(self, domain):
|
||||
return self._spiders.get(domain)
|
||||
def create(self, spider_name, **spider_kwargs):
|
||||
"""Returns a Spider instance for the given spider name, using the given
|
||||
spider arguments. If the spider name is not found, it raises a
|
||||
KeyError.
|
||||
"""
|
||||
spider = self._spiders[spider_name]
|
||||
spider.__dict__.update(spider_kwargs)
|
||||
return spider
|
||||
|
||||
def fromurl(self, url):
|
||||
if self.force_domain:
|
||||
return self._spiders.get(self.force_domain)
|
||||
domain = urlparse.urlparse(url).hostname
|
||||
domain = str(domain).replace('www.', '')
|
||||
if domain:
|
||||
if domain in self._spiders: # try first locating by domain
|
||||
return self._spiders[domain]
|
||||
else: # else search spider by spider
|
||||
plist = self._spiders.values()
|
||||
for p in plist:
|
||||
if url_is_from_spider(url, p):
|
||||
return p
|
||||
def find_by_request(self, request):
|
||||
"""Returns list of spiders names that match the given Request"""
|
||||
return [name for name, spider in self._spiders.iteritems()
|
||||
if url_is_from_spider(request.url, spider)]
|
||||
|
||||
def list(self):
|
||||
"""Returns list of spiders available."""
|
||||
return self._spiders.keys()
|
||||
|
||||
def load(self, spider_modules=None):
|
||||
"""Load spiders from module directory."""
|
||||
if spider_modules is None:
|
||||
spider_modules = settings.getlist('SPIDER_MODULES')
|
||||
self.spider_modules = spider_modules
|
||||
self._invaliddict = {}
|
||||
self._spiders = {}
|
||||
|
||||
modules = [__import__(m, {}, {}, ['']) for m in self.spider_modules]
|
||||
for module in modules:
|
||||
for spider in self._getspiders(ISpider, module):
|
||||
ISpider.validateInvariants(spider)
|
||||
self._spiders[spider.domain_name] = spider
|
||||
self._spiders[spider.name] = spider
|
||||
self.loaded = True
|
||||
|
||||
def _getspiders(self, interface, package):
|
||||
@ -77,14 +72,14 @@ class TwistedPluginSpiderManager(object):
|
||||
"""Reload spider module to release any resources held on to by the
|
||||
spider
|
||||
"""
|
||||
domain = spider.domain_name
|
||||
if domain not in self._spiders:
|
||||
name = spider.name
|
||||
if name not in self._spiders:
|
||||
return
|
||||
spider = self._spiders[domain]
|
||||
spider = self._spiders[name]
|
||||
module_name = spider.__module__
|
||||
module = sys.modules[module_name]
|
||||
if hasattr(module, 'SPIDER'):
|
||||
log.msg("Reloading module %s" % module_name, spider=spider, \
|
||||
level=log.DEBUG)
|
||||
new_module = rebuild(module, doLog=0)
|
||||
self._spiders[domain] = new_module.SPIDER
|
||||
self._spiders[name] = new_module.SPIDER
|
||||
|
@ -47,8 +47,7 @@ class OffsiteMiddleware(object):
|
||||
return re.compile(regex)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
domains = [spider.domain_name] + spider.extra_domain_names
|
||||
self.host_regexes[spider] = self.get_host_regex(domains)
|
||||
self.host_regexes[spider] = self.get_host_regex(spider.allowed_domains)
|
||||
self.domains_seen[spider] = set()
|
||||
|
||||
def spider_closed(self, spider):
|
||||
|
@ -59,9 +59,9 @@ class CrawlSpider(InitSpider):
|
||||
"""
|
||||
rules = ()
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, *a, **kw):
|
||||
"""Constructor takes care of compiling rules"""
|
||||
super(CrawlSpider, self).__init__()
|
||||
super(CrawlSpider, self).__init__(*a, **kw)
|
||||
self._compile_rules()
|
||||
|
||||
def parse(self, response):
|
||||
|
@ -3,8 +3,8 @@ from scrapy.spider import BaseSpider
|
||||
class InitSpider(BaseSpider):
|
||||
"""Base Spider with initialization facilities"""
|
||||
|
||||
def __init__(self):
|
||||
super(InitSpider, self).__init__()
|
||||
def __init__(self, *a, **kw):
|
||||
super(InitSpider, self).__init__(*a, **kw)
|
||||
self._postinit_reqs = []
|
||||
self._init_complete = False
|
||||
self._init_started = False
|
||||
|
@ -23,6 +23,6 @@ class StatsMailer(object):
|
||||
mail = MailSender()
|
||||
body = "Global stats\n\n"
|
||||
body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items())
|
||||
body += "\n\n%s stats\n\n" % spider.domain_name
|
||||
body += "\n\n%s stats\n\n" % spider.name
|
||||
body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
|
||||
mail.send(self.recipients, "Scrapy stats for: %s" % spider.domain_name, body)
|
||||
mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
|
||||
|
@ -60,7 +60,7 @@ class LiveStats(object):
|
||||
runtime = datetime.now() - stats.started
|
||||
|
||||
s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td></tr>\n' % \
|
||||
(spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime))
|
||||
(spider.name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime))
|
||||
|
||||
totdomains += 1
|
||||
totscraped += stats.scraped
|
||||
|
@ -25,18 +25,18 @@ class Spiderctl(object):
|
||||
dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.running[spider.domain_name] = spider
|
||||
self.running[spider.name] = spider
|
||||
|
||||
def spider_closed(self, spider):
|
||||
del self.running[spider.domain_name]
|
||||
self.finished.add(spider.domain_name)
|
||||
del self.running[spider.name]
|
||||
self.finished.add(spider.name)
|
||||
|
||||
def webconsole_render(self, wc_request):
|
||||
if wc_request.args:
|
||||
changes = self.webconsole_control(wc_request)
|
||||
|
||||
self.scheduled = [s.domain_name for s in scrapyengine.spider_scheduler._pending_spiders]
|
||||
self.idle = [d for d in self.enabled_domains if d not in self.scheduled
|
||||
self.scheduled = [s.name for s in scrapyengine.spider_scheduler._pending_spiders]
|
||||
self.idle = [d for d in self.enabled_spiders if d not in self.scheduled
|
||||
and d not in self.running
|
||||
and d not in self.finished]
|
||||
|
||||
@ -53,9 +53,9 @@ class Spiderctl(object):
|
||||
# idle
|
||||
s += "<td valign='top'>\n"
|
||||
s += '<form method="post" action=".">\n'
|
||||
s += '<select name="add_pending_domains" multiple="multiple">\n'
|
||||
for domain in sorted(self.idle):
|
||||
s += "<option>%s</option>\n" % domain
|
||||
s += '<select name="add_pending_spiders" multiple="multiple">\n'
|
||||
for name in sorted(self.idle):
|
||||
s += "<option>%s</option>\n" % name
|
||||
s += '</select><br>\n'
|
||||
s += '<br />'
|
||||
s += '<input type="submit" value="Schedule selected">\n'
|
||||
@ -65,9 +65,9 @@ class Spiderctl(object):
|
||||
# scheduled
|
||||
s += "<td valign='top'>\n"
|
||||
s += '<form method="post" action=".">\n'
|
||||
s += '<select name="remove_pending_domains" multiple="multiple">\n'
|
||||
for domain in self.scheduled:
|
||||
s += "<option>%s</option>\n" % domain
|
||||
s += '<select name="remove_pending_spiders" multiple="multiple">\n'
|
||||
for name in self.scheduled:
|
||||
s += "<option>%s</option>\n" % name
|
||||
s += '</select><br>\n'
|
||||
s += '<br />'
|
||||
s += '<input type="submit" value="Remove selected">\n'
|
||||
@ -78,9 +78,9 @@ class Spiderctl(object):
|
||||
# running
|
||||
s += "<td valign='top'>\n"
|
||||
s += '<form method="post" action=".">\n'
|
||||
s += '<select name="stop_running_domains" multiple="multiple">\n'
|
||||
for domain in sorted(self.running):
|
||||
s += "<option>%s</option>\n" % domain
|
||||
s += '<select name="stop_running_spiders" multiple="multiple">\n'
|
||||
for name in sorted(self.running):
|
||||
s += "<option>%s</option>\n" % name
|
||||
s += '</select><br>\n'
|
||||
s += '<br />'
|
||||
s += '<input type="submit" value="Stop selected">\n'
|
||||
@ -90,9 +90,9 @@ class Spiderctl(object):
|
||||
# finished
|
||||
s += "<td valign='top'>\n"
|
||||
s += '<form method="post" action=".">\n'
|
||||
s += '<select name="rerun_finished_domains" multiple="multiple">\n'
|
||||
for domain in sorted(self.finished):
|
||||
s += "<option>%s</option>\n" % domain
|
||||
s += '<select name="rerun_finished_spiders" multiple="multiple">\n'
|
||||
for name in sorted(self.finished):
|
||||
s += "<option>%s</option>\n" % name
|
||||
s += '</select><br>\n'
|
||||
s += '<br />'
|
||||
s += '<input type="submit" value="Re-schedule selected">\n'
|
||||
@ -114,42 +114,42 @@ class Spiderctl(object):
|
||||
args = wc_request.args
|
||||
s = "<hr />\n"
|
||||
|
||||
if "stop_running_domains" in args:
|
||||
if "stop_running_spiders" in args:
|
||||
s += "<p>"
|
||||
stopped_domains = []
|
||||
for domain in args["stop_running_domains"]:
|
||||
if domain in self.running:
|
||||
scrapyengine.close_spider(self.running[domain])
|
||||
stopped_domains.append(domain)
|
||||
s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains)
|
||||
stopped_spiders = []
|
||||
for name in args["stop_running_spiders"]:
|
||||
if name in self.running:
|
||||
scrapyengine.close_spider(self.running[name])
|
||||
stopped_spiders.append(name)
|
||||
s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_spiders)
|
||||
s += "</p>"
|
||||
if "remove_pending_domains" in args:
|
||||
if "remove_pending_spiders" in args:
|
||||
removed = []
|
||||
for domain in args["remove_pending_domains"]:
|
||||
if scrapyengine.spider_scheduler.remove_pending_domain(domain):
|
||||
removed.append(domain)
|
||||
for name in args["remove_pending_spiders"]:
|
||||
if scrapyengine.spider_scheduler.remove_pending_spider(name):
|
||||
removed.append(name)
|
||||
if removed:
|
||||
s += "<p>"
|
||||
s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_domains"])
|
||||
s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_spiders"])
|
||||
s += "</p>"
|
||||
if "add_pending_domains" in args:
|
||||
for domain in args["add_pending_domains"]:
|
||||
if domain not in scrapyengine.scheduler.pending_requests:
|
||||
scrapymanager.crawl(domain)
|
||||
if "add_pending_spiders" in args:
|
||||
for name in args["add_pending_spiders"]:
|
||||
if name not in scrapyengine.scheduler.pending_requests:
|
||||
scrapymanager.crawl_spider_name(name)
|
||||
s += "<p>"
|
||||
s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"])
|
||||
s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_spiders"])
|
||||
s += "</p>"
|
||||
if "rerun_finished_domains" in args:
|
||||
for domain in args["rerun_finished_domains"]:
|
||||
if domain not in scrapyengine.scheduler.pending_requests:
|
||||
scrapymanager.crawl(domain)
|
||||
self.finished.remove(domain)
|
||||
if "rerun_finished_spiders" in args:
|
||||
for name in args["rerun_finished_spiders"]:
|
||||
if name not in scrapyengine.scheduler.pending_requests:
|
||||
scrapymanager.crawl_spider_name(name)
|
||||
self.finished.remove(name)
|
||||
s += "<p>"
|
||||
s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"])
|
||||
s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_spiders"])
|
||||
s += "</p>"
|
||||
|
||||
return s
|
||||
|
||||
def webconsole_discover_module(self):
|
||||
self.enabled_domains = spiders.list()
|
||||
self.enabled_spiders = spiders.list()
|
||||
return self
|
||||
|
@ -23,7 +23,7 @@ class StatsDump(object):
|
||||
s += "<h3>Global stats</h3>\n"
|
||||
s += stats_html_table(stats.get_stats())
|
||||
for spider, spider_stats in stats.iter_spider_stats():
|
||||
s += "<h3>%s</h3>\n" % spider.domain_name
|
||||
s += "<h3>%s</h3>\n" % spider.name
|
||||
s += stats_html_table(spider_stats)
|
||||
s += "</body>\n"
|
||||
s += "</html>\n"
|
||||
|
4
scrapy/contrib_exp/crawlspider/__init__.py
Normal file
4
scrapy/contrib_exp/crawlspider/__init__.py
Normal file
@ -0,0 +1,4 @@
"""CrawlSpider v2"""

from .rules import Rule
from .spider import CrawlSpider
61
scrapy/contrib_exp/crawlspider/matchers.py
Normal file
61
scrapy/contrib_exp/crawlspider/matchers.py
Normal file
@ -0,0 +1,61 @@
"""
Request/Response Matchers

Perform evaluation to Request or Response attributes
"""

import re

class BaseMatcher(object):
    """Base matcher. Returns True by default."""

    def matches_request(self, request):
        """Performs Request Matching"""
        return True

    def matches_response(self, response):
        """Performs Response Matching"""
        return True


class UrlMatcher(BaseMatcher):
    """Matches URL attribute"""

    def __init__(self, url):
        """Initialize url attribute"""
        self._url = url

    def matches_url(self, url):
        """Returns True if given url is equal to matcher's url"""
        return self._url == url

    def matches_request(self, request):
        """Returns True if Request's url matches initial url"""
        return self.matches_url(request.url)

    def matches_response(self, response):
        """Returns True if Response's url matches initial url"""
        return self.matches_url(response.url)


class UrlRegexMatcher(UrlMatcher):
    """Matches URL using regular expression"""

    def __init__(self, regex, flags=0):
        """Initialize regular expression"""
        self._regex = re.compile(regex, flags)

    def matches_url(self, url):
        """Returns True if url matches regular expression"""
        return self._regex.search(url) is not None


class UrlListMatcher(UrlMatcher):
    """Matches if URL is in List"""

    def __init__(self, urls):
        self._urls = urls

    def matches_url(self, url):
        """Returns True if url is in urls list"""
        return url in self._urls
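
A short sketch of exercising these matchers on their own (URLs and patterns
are illustrative):

    from scrapy.http import Request, Response
    from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher, UrlListMatcher

    items = UrlRegexMatcher(r'/product/\d+')
    print items.matches_request(Request('http://example.com/product/123'))   # True

    start = UrlListMatcher(['http://example.com/'])
    print start.matches_response(Response('http://example.com/'))            # True
    print start.matches_response(Response('http://example.com/other'))       # False
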
117
scrapy/contrib_exp/crawlspider/reqext.py
Normal file
117
scrapy/contrib_exp/crawlspider/reqext.py
Normal file
@ -0,0 +1,117 @@
|
||||
"""Request Extractors"""
|
||||
from scrapy.http import Request
|
||||
from scrapy.selector import HtmlXPathSelector
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.python import FixedSGMLParser, str_to_unicode
|
||||
from scrapy.utils.url import safe_url_string, urljoin_rfc
|
||||
|
||||
from itertools import ifilter
|
||||
|
||||
|
||||
class BaseSgmlRequestExtractor(FixedSGMLParser):
|
||||
"""Base SGML Request Extractor"""
|
||||
|
||||
def __init__(self, tag='a', attr='href'):
|
||||
"""Initialize attributes"""
|
||||
FixedSGMLParser.__init__(self)
|
||||
|
||||
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||
self.current_request = None
|
||||
|
||||
def extract_requests(self, response):
|
||||
"""Returns list of requests extracted from response"""
|
||||
return self._extract_requests(response.body, response.url,
|
||||
response.encoding)
|
||||
|
||||
def _extract_requests(self, response_text, response_url, response_encoding):
|
||||
"""Extract requests with absolute urls"""
|
||||
self.reset()
|
||||
self.feed(response_text)
|
||||
self.close()
|
||||
|
||||
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||
self._make_absolute_urls(base_url, response_encoding)
|
||||
self._fix_link_text_encoding(response_encoding)
|
||||
|
||||
return self.requests
|
||||
|
||||
def _make_absolute_urls(self, base_url, encoding):
|
||||
"""Makes all request's urls absolute"""
|
||||
for req in self.requests:
|
||||
url = req.url
|
||||
# make absolute url
|
||||
url = urljoin_rfc(base_url, url, encoding)
|
||||
url = safe_url_string(url, encoding)
|
||||
# replace in-place request's url
|
||||
req.url = url
|
||||
|
||||
def _fix_link_text_encoding(self, encoding):
|
||||
"""Convert link_text to unicode for each request"""
|
||||
for req in self.requests:
|
||||
req.meta.setdefault('link_text', '')
|
||||
req.meta['link_text'] = str_to_unicode(req.meta['link_text'],
|
||||
encoding)
|
||||
|
||||
def reset(self):
|
||||
"""Reset state"""
|
||||
FixedSGMLParser.reset(self)
|
||||
self.requests = []
|
||||
self.base_url = None
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
"""Process unknown start tag"""
|
||||
if 'base' == tag:
|
||||
self.base_url = dict(attrs).get('href')
|
||||
|
||||
_matches = lambda (attr, value): self.scan_attr(attr) \
|
||||
and value is not None
|
||||
if self.scan_tag(tag):
|
||||
for attr, value in ifilter(_matches, attrs):
|
||||
req = Request(url=value)
|
||||
self.requests.append(req)
|
||||
self.current_request = req
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
"""Process unknown end tag"""
|
||||
self.current_request = None
|
||||
|
||||
def handle_data(self, data):
|
||||
"""Process data"""
|
||||
current = self.current_request
|
||||
if current and not 'link_text' in current.meta:
|
||||
current.meta['link_text'] = data.strip()
|
||||
|
||||
|
||||
class SgmlRequestExtractor(BaseSgmlRequestExtractor):
|
||||
"""SGML Request Extractor"""
|
||||
|
||||
def __init__(self, tags=None, attrs=None):
|
||||
"""Initialize with custom tag & attribute function checkers"""
|
||||
# defaults
|
||||
tags = tuple(tags) if tags else ('a', 'area')
|
||||
attrs = tuple(attrs) if attrs else ('href', )
|
||||
|
||||
tag_func = lambda x: x in tags
|
||||
attr_func = lambda x: x in attrs
|
||||
BaseSgmlRequestExtractor.__init__(self, tag=tag_func, attr=attr_func)
|
||||
|
||||
# TODO: move to own file
|
||||
class XPathRequestExtractor(SgmlRequestExtractor):
|
||||
"""SGML Request Extractor with XPath restriction"""
|
||||
|
||||
def __init__(self, restrict_xpaths, tags=None, attrs=None):
|
||||
"""Initialize XPath restrictions"""
|
||||
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
|
||||
SgmlRequestExtractor.__init__(self, tags, attrs)
|
||||
|
||||
def extract_requests(self, response):
|
||||
"""Restrict to XPath regions"""
|
||||
hxs = HtmlXPathSelector(response)
|
||||
fragments = (''.join(
|
||||
html_frag for html_frag in hxs.select(xpath).extract()
|
||||
) for xpath in self.restrict_xpaths)
|
||||
html_slice = ''.join(html_frag for html_frag in fragments)
|
||||
return self._extract_requests(html_slice, response.url,
|
||||
response.encoding)
|
||||
|
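
A sketch of pulling requests out of a response with these extractors (the
HTML body and XPath are illustrative):

    from scrapy.http import HtmlResponse
    from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor, \
        XPathRequestExtractor

    body = '<html><body><div id="nav"><a href="/page/2">next</a></div></body></html>'
    response = HtmlResponse('http://example.com/', body=body)

    # every <a>/<area> href becomes a Request with an absolute url
    for req in SgmlRequestExtractor().extract_requests(response):
        print req.url, req.meta.get('link_text')

    # same, but only links inside the restricted region are considered
    for req in XPathRequestExtractor(restrict_xpaths='//div[@id="nav"]').extract_requests(response):
        print req.url
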
27
scrapy/contrib_exp/crawlspider/reqgen.py
Normal file
27
scrapy/contrib_exp/crawlspider/reqgen.py
Normal file
@ -0,0 +1,27 @@
"""Request Generator"""
from itertools import imap

class RequestGenerator(object):
    """Extract and process requests from response"""

    def __init__(self, req_extractors, req_processors, callback, spider=None):
        """Initialize attributes"""
        self._request_extractors = req_extractors
        self._request_processors = req_processors
        #TODO: resolve callback?
        self._callback = callback

    def generate_requests(self, response):
        """Extract and process new requests from response.
        Attach callback to each request as default callback."""
        requests = []
        for ext in self._request_extractors:
            requests.extend(ext.extract_requests(response))

        for proc in self._request_processors:
            requests = proc(requests)

        # return iterator
        # @@@ creates new Request object with callback
        return imap(lambda r: r.replace(callback=self._callback), requests)
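
A sketch of wiring a RequestGenerator together by hand; CrawlSpider v2 builds
one internally (see spider.py below), so the callback here is just a placeholder:

    from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes
    from scrapy.contrib_exp.crawlspider.reqgen import RequestGenerator

    def parse_page(response):
        return []   # placeholder callback

    reqgen = RequestGenerator(req_extractors=[SgmlRequestExtractor()],
                              req_processors=[Canonicalize(), FilterDupes()],
                              callback=parse_page)

    # requests = reqgen.generate_requests(response)  # each request gets callback=parse_page
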
111
scrapy/contrib_exp/crawlspider/reqproc.py
Normal file
111
scrapy/contrib_exp/crawlspider/reqproc.py
Normal file
@ -0,0 +1,111 @@
|
||||
"""Request Processors"""
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
|
||||
|
||||
from itertools import ifilter, imap
|
||||
|
||||
import re
|
||||
|
||||
class Canonicalize(object):
|
||||
"""Canonicalize Request Processor"""
|
||||
def _replace_url(self, req):
|
||||
# replace in-place
|
||||
req.url = canonicalize_url(req.url)
|
||||
return req
|
||||
|
||||
def __call__(self, requests):
|
||||
"""Canonicalize all requests' urls"""
|
||||
return imap(self._replace_url, requests)
|
||||
|
||||
|
||||
class FilterDupes(object):
|
||||
"""Filter duplicate Requests"""
|
||||
|
||||
def __init__(self, *attributes):
|
||||
"""Initialize comparison attributes"""
|
||||
self._attributes = tuple(attributes) if attributes \
|
||||
else tuple(['url'])
|
||||
|
||||
def _equal_attr(self, obj1, obj2, attr):
|
||||
return getattr(obj1, attr) == getattr(obj2, attr)
|
||||
|
||||
def _requests_equal(self, req1, req2):
|
||||
"""Attribute comparison helper"""
|
||||
# look for not equal attribute
|
||||
_not_equal = lambda attr: not self._equal_attr(req1, req2, attr)
|
||||
for attr in ifilter(_not_equal, self._attributes):
|
||||
return False
|
||||
# all attributes equal
|
||||
return True
|
||||
|
||||
def _request_in(self, request, requests_seen):
|
||||
"""Check if request is in given requests seen list"""
|
||||
_req_seen = lambda r: self._requests_equal(r, request)
|
||||
for seen in ifilter(_req_seen, requests_seen):
|
||||
return True
|
||||
# request not seen
|
||||
return False
|
||||
|
||||
def __call__(self, requests):
|
||||
"""Filter seen requests"""
|
||||
# per-call duplicates filter
|
||||
self.requests_seen = set()
|
||||
_not_seen = lambda r: not self._request_in(r, self.requests_seen)
|
||||
for req in ifilter(_not_seen, requests):
|
||||
yield req
|
||||
# registry seen request
|
||||
self.requests_seen.add(req)
|
||||
|
||||
|
||||
class FilterDomain(object):
|
||||
"""Filter request's domain"""
|
||||
|
||||
def __init__(self, allow=(), deny=()):
|
||||
"""Initialize allow/deny attributes"""
|
||||
self.allow = tuple(arg_to_iter(allow))
|
||||
self.deny = tuple(arg_to_iter(deny))
|
||||
|
||||
def __call__(self, requests):
|
||||
"""Filter domains"""
|
||||
processed = (req for req in requests)
|
||||
|
||||
if self.allow:
|
||||
processed = (req for req in requests
|
||||
if url_is_from_any_domain(req.url, self.allow))
|
||||
if self.deny:
|
||||
processed = (req for req in requests
|
||||
if not url_is_from_any_domain(req.url, self.deny))
|
||||
|
||||
return processed
|
||||
|
||||
|
||||
class FilterUrl(object):
|
||||
"""Filter request's url"""
|
||||
|
||||
def __init__(self, allow=(), deny=()):
|
||||
"""Initialize allow/deny attributes"""
|
||||
_re_type = type(re.compile('', 0))
|
||||
|
||||
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(allow)]
|
||||
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||
for x in arg_to_iter(deny)]
|
||||
|
||||
def __call__(self, requests):
|
||||
"""Filter request's url based on allow/deny rules"""
|
||||
#TODO: filter valid urls here?
|
||||
processed = (req for req in requests)
|
||||
|
||||
if self.allow_res:
|
||||
processed = (req for req in requests
|
||||
if self._matches(req.url, self.allow_res))
|
||||
if self.deny_res:
|
||||
processed = (req for req in requests
|
||||
if not self._matches(req.url, self.deny_res))
|
||||
|
||||
return processed
|
||||
|
||||
def _matches(self, url, regexs):
|
||||
"""Returns True if url matches any regex in given list"""
|
||||
return any(r.search(url) for r in regexs)
|
||||
|
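
Processors are plain callables over an iterable of requests, so they chain
naturally. A small sketch (domains and patterns are illustrative):

    from scrapy.http import Request
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, \
        FilterDomain, FilterUrl, FilterDupes

    requests = [Request('http://example.com/item?ref=1&id=2'),
                Request('http://other.org/item?id=3')]

    processors = [Canonicalize(),
                  FilterDomain(allow='example.com'),
                  FilterUrl(allow=r'/item'),
                  FilterDupes()]

    for proc in processors:
        requests = proc(requests)

    print [r.url for r in requests]   # only the example.com item request survives
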
100
scrapy/contrib_exp/crawlspider/rules.py
Normal file
100
scrapy/contrib_exp/crawlspider/rules.py
Normal file
@ -0,0 +1,100 @@
|
||||
"""Crawler Rules"""
|
||||
from scrapy.http import Request
|
||||
from scrapy.http import Response
|
||||
|
||||
from functools import partial
|
||||
from itertools import ifilter
|
||||
|
||||
from .matchers import BaseMatcher
|
||||
# default string-to-matcher class
|
||||
from .matchers import UrlRegexMatcher
|
||||
|
||||
class CompiledRule(object):
|
||||
"""Compiled version of Rule"""
|
||||
def __init__(self, matcher, callback=None, follow=False):
|
||||
"""Initialize attributes checking type"""
|
||||
assert isinstance(matcher, BaseMatcher)
|
||||
assert callback is None or callable(callback)
|
||||
assert isinstance(follow, bool)
|
||||
|
||||
self.matcher = matcher
|
||||
self.callback = callback
|
||||
self.follow = follow
|
||||
|
||||
|
||||
class Rule(object):
|
||||
"""Crawler Rule"""
|
||||
def __init__(self, matcher=None, callback=None, follow=False, **kwargs):
|
||||
"""Store attributes"""
|
||||
self.matcher = matcher
|
||||
self.callback = callback
|
||||
self.cb_kwargs = kwargs if kwargs else {}
|
||||
self.follow = True if follow else False
|
||||
|
||||
if self.callback is None and self.follow is False:
|
||||
raise ValueError("Rule must either have a callback or "
|
||||
"follow=True: %r" % self)
|
||||
|
||||
def __repr__(self):
|
||||
return "Rule(matcher=%r, callback=%r, follow=%r, **%r)" \
|
||||
% (self.matcher, self.callback, self.follow, self.cb_kwargs)
|
||||
|
||||
|
||||
class RulesManager(object):
|
||||
"""Rules Manager"""
|
||||
def __init__(self, rules, spider, default_matcher=UrlRegexMatcher):
|
||||
"""Initialize rules using spider and default matcher"""
|
||||
self._rules = tuple()
|
||||
|
||||
# compile absolute/relative-to-spider callbacks"""
|
||||
for rule in rules:
|
||||
# prepare matcher
|
||||
if rule.matcher is None:
|
||||
# instance BaseMatcher by default
|
||||
matcher = BaseMatcher()
|
||||
elif isinstance(rule.matcher, BaseMatcher):
|
||||
matcher = rule.matcher
|
||||
else:
|
||||
# matcher not BaseMatcher, check for string
|
||||
if isinstance(rule.matcher, basestring):
|
||||
# instance default matcher
|
||||
matcher = default_matcher(rule.matcher)
|
||||
else:
|
||||
raise ValueError('Not valid matcher given %r in %r' \
|
||||
% (rule.matcher, rule))
|
||||
|
||||
# prepare callback
|
||||
if callable(rule.callback):
|
||||
callback = rule.callback
|
||||
elif not rule.callback is None:
|
||||
# callback from spider
|
||||
callback = getattr(spider, rule.callback)
|
||||
|
||||
if not callable(callback):
|
||||
raise AttributeError('Invalid callback %r can not be resolved' \
|
||||
% callback)
|
||||
else:
|
||||
callback = None
|
||||
|
||||
if rule.cb_kwargs:
|
||||
# build partial callback
|
||||
callback = partial(callback, **rule.cb_kwargs)
|
||||
|
||||
# append compiled rule to rules list
|
||||
crule = CompiledRule(matcher, callback, follow=rule.follow)
|
||||
self._rules += (crule, )
|
||||
|
||||
def get_rule_from_request(self, request):
|
||||
"""Returns first rule that matches given Request"""
|
||||
_matches = lambda r: r.matcher.matches_request(request)
|
||||
for rule in ifilter(_matches, self._rules):
|
||||
# return first match of iterator
|
||||
return rule
|
||||
|
||||
def get_rule_from_response(self, response):
|
||||
"""Returns first rule that matches given Response"""
|
||||
_matches = lambda r: r.matcher.matches_response(response)
|
||||
for rule in ifilter(_matches, self._rules):
|
||||
# return first match of iterator
|
||||
return rule
|
||||
|
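
A sketch of how rules resolve: string matchers go through the default
UrlRegexMatcher and string callbacks are looked up on the spider object (here a
plain stand-in object, since only the callback lookup matters):

    from scrapy.http import Request
    from scrapy.contrib_exp.crawlspider.rules import Rule, RulesManager

    class Callbacks(object):
        def parse_item(self, response):
            return []

    rules = RulesManager([Rule(r'/item/\d+', callback='parse_item'),
                          Rule(r'/category/', follow=True)],
                         spider=Callbacks())

    rule = rules.get_rule_from_request(Request('http://example.com/item/42'))
    print rule.callback   # bound parse_item of the stand-in object
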
69
scrapy/contrib_exp/crawlspider/spider.py
Normal file
69
scrapy/contrib_exp/crawlspider/spider.py
Normal file
@ -0,0 +1,69 @@
"""CrawlSpider v2"""
from scrapy.spider import BaseSpider
from scrapy.utils.spider import iterate_spider_output
from scrapy import log

from .matchers import UrlListMatcher
from .rules import Rule, RulesManager
from .reqext import SgmlRequestExtractor
from .reqgen import RequestGenerator
from .reqproc import Canonicalize, FilterDupes

class CrawlSpider(BaseSpider):
    """CrawlSpider v2"""

    request_extractors = None
    request_processors = None
    rules = []

    def __init__(self, *a, **kw):
        """Initialize dispatcher"""
        super(CrawlSpider, self).__init__(*a, **kw)

        # auto follow start urls
        if self.start_urls:
            _matcher = UrlListMatcher(self.start_urls)
            # append new rule using type from current self.rules
            rules = self.rules + type(self.rules)([
                Rule(_matcher, follow=True)
                ])
        else:
            rules = self.rules

        # set defaults if not set
        if self.request_extractors is None:
            # default link extractor. Extracts all links from response
            self.request_extractors = [SgmlRequestExtractor()]

        if self.request_processors is None:
            # default processor. Filters duplicate requests
            self.request_processors = [FilterDupes()]


        # wrap rules
        self._rulesman = RulesManager(rules, spider=self)
        # generates new requests with given callback
        self._reqgen = RequestGenerator(self.request_extractors,
                                        self.request_processors,
                                        callback=self.parse)

    def parse(self, response):
        """Dispatch callback and generate requests"""
        # get rule for response
        rule = self._rulesman.get_rule_from_response(response)

        if rule:
            # dispatch callback if set
            if rule.callback:
                output = iterate_spider_output(rule.callback(response))
                for req_or_item in output:
                    yield req_or_item

            if rule.follow:
                for req in self._reqgen.generate_requests(response):
                    # only dispatch request if it has a matching rule
                    if self._rulesman.get_rule_from_request(req):
                        yield req
        else:
            self.log("No rule for response %s" % response, level=log.WARNING)
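
A sketch of a spider built on this class; the domain, rule patterns and
processors are all illustrative:

    from scrapy.contrib_exp.crawlspider.spider import CrawlSpider
    from scrapy.contrib_exp.crawlspider.rules import Rule
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        start_urls = ['http://example.com/']

        # follow category pages, parse item pages
        rules = [
            Rule(r'/category/', follow=True),
            Rule(r'/item/\d+', callback='parse_item'),
        ]

        # optional: replace the default FilterDupes-only processor chain
        request_processors = [Canonicalize(), FilterDupes()]

        def parse_item(self, response):
            self.log("parsing %s" % response.url)
            return []
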
@ -1,55 +0,0 @@
|
||||
"""
|
||||
A pipeline to persist objects using shove.
|
||||
|
||||
Shove is a "new generation" shelve. For more information see:
|
||||
http://pypi.python.org/pypi/shove
|
||||
"""
|
||||
|
||||
from string import Template
|
||||
|
||||
from shove import Shove
|
||||
from scrapy.xlib.pydispatch import dispatcher
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.core import signals
|
||||
from scrapy.conf import settings
|
||||
from scrapy.core.exceptions import NotConfigured
|
||||
|
||||
class ShoveItemPipeline(object):
|
||||
|
||||
def __init__(self):
|
||||
self.uritpl = settings['SHOVEITEM_STORE_URI']
|
||||
if not self.uritpl:
|
||||
raise NotConfigured
|
||||
self.opts = settings['SHOVEITEM_STORE_OPT'] or {}
|
||||
self.stores = {}
|
||||
|
||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
||||
|
||||
def process_item(self, spider, item):
|
||||
guid = str(item.guid)
|
||||
|
||||
if guid in self.stores[spider]:
|
||||
if self.stores[spider][guid] == item:
|
||||
status = 'old'
|
||||
else:
|
||||
status = 'upd'
|
||||
else:
|
||||
status = 'new'
|
||||
|
||||
if not status == 'old':
|
||||
self.stores[spider][guid] = item
|
||||
self.log(spider, item, status)
|
||||
return item
|
||||
|
||||
def spider_opened(self, spider):
|
||||
uri = Template(self.uritpl).substitute(domain=spider.domain_name)
|
||||
self.stores[spider] = Shove(uri, **self.opts)
|
||||
|
||||
def spider_closed(self, spider):
|
||||
self.stores[spider].sync()
|
||||
|
||||
def log(self, spider, item, status):
|
||||
log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, \
|
||||
spider=spider)
|
@ -2,6 +2,7 @@
|
||||
Download web pages using asynchronous IO
|
||||
"""
|
||||
|
||||
import random
|
||||
from time import time
|
||||
|
||||
from twisted.internet import reactor, defer
|
||||
@ -20,15 +21,21 @@ class SpiderInfo(object):
|
||||
|
||||
def __init__(self, download_delay=None, max_concurrent_requests=None):
|
||||
if download_delay is None:
|
||||
self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||
self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||
else:
|
||||
self.download_delay = download_delay
|
||||
if self.download_delay:
|
||||
self._download_delay = float(download_delay)
|
||||
if self._download_delay:
|
||||
self.max_concurrent_requests = 1
|
||||
elif max_concurrent_requests is None:
|
||||
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
|
||||
else:
|
||||
self.max_concurrent_requests = max_concurrent_requests
|
||||
if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
|
||||
# same policy as wget --random-wait
|
||||
self.random_delay_interval = (0.5*self._download_delay, \
|
||||
1.5*self._download_delay)
|
||||
else:
|
||||
self.random_delay_interval = None
|
||||
|
||||
self.active = set()
|
||||
self.queue = []
|
||||
@ -44,6 +51,12 @@ class SpiderInfo(object):
|
||||
# use self.active to include requests in the downloader middleware
|
||||
return len(self.active) > 2 * self.max_concurrent_requests
|
||||
|
||||
def download_delay(self):
|
||||
if self.random_delay_interval:
|
||||
return random.uniform(*self.random_delay_interval)
|
||||
else:
|
||||
return self._download_delay
|
||||
|
||||
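
With RANDOMIZE_DOWNLOAD_DELAY enabled (the default), the effective delay is
drawn per request from 0.5x to 1.5x of the configured delay, the same policy
as wget --random-wait. For example, assuming DOWNLOAD_DELAY = 2.0:

    import random
    download_delay = 2.0   # assumed DOWNLOAD_DELAY value
    delay = random.uniform(0.5 * download_delay, 1.5 * download_delay)   # 1.0 .. 3.0 seconds
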
def cancel_request_calls(self):
|
||||
for call in self.next_request_calls:
|
||||
call.cancel()
|
||||
@ -99,8 +112,9 @@ class Downloader(object):
|
||||
|
||||
# Delay queue processing if a download_delay is configured
|
||||
now = time()
|
||||
if site.download_delay:
|
||||
penalty = site.download_delay - now + site.lastseen
|
||||
delay = site.download_delay()
|
||||
if delay:
|
||||
penalty = delay - now + site.lastseen
|
||||
if penalty > 0:
|
||||
d = defer.Deferred()
|
||||
d.addCallback(self._process_queue)
|
||||
|
@@ -1,5 +1,4 @@
import signal
from collections import defaultdict

from twisted.internet import reactor

@@ -7,54 +6,13 @@ from scrapy.extension import extensions
from scrapy import log
from scrapy.http import Request
from scrapy.core.engine import scrapyengine
from scrapy.spider import BaseSpider, spiders
from scrapy.spider import spiders
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.url import is_url
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names

def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] = request
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests


class ExecutionManager(object):
    """Process a list of sites or urls.

    This class should be used in a main for process a list of sites/urls.

    It extracts products and could be used to store results in a database or
    just for testing spiders.
    """
    def __init__(self):
        self.interrupted = False
        self.configured = False

@@ -78,24 +36,46 @@ class ExecutionManager(object):
        scrapyengine.configure()
        self.configured = True

    def crawl(self, *args):
        """Schedule the given args for crawling. args is a list of urls or domains"""
    def crawl_url(self, url, spider=None):
        """Schedule given url for crawling."""
        if spider is None:
            spider = self._create_spider_for_request(Request(url), log_none=True, \
                log_multiple=True)
        if spider:
            requests = arg_to_iter(spider.make_requests_from_url(url))
            self._crawl_requests(requests, spider)

    def crawl_request(self, request, spider=None):
        """Schedule request for crawling."""
        assert self.configured, "Scrapy Manager not yet configured"
        spider_requests = _get_spider_requests(*args)
        for spider, requests in spider_requests.iteritems():
            for request in requests:
                scrapyengine.crawl(request, spider)
        if spider is None:
            spider = self._create_spider_for_request(request, log_none=True, \
                log_multiple=True)
        if spider:
            scrapyengine.crawl(request, spider)

    def runonce(self, *args):
        """Run the engine until it finishes scraping all domains and then exit"""
        self.crawl(*args)
        scrapyengine.start()
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)
    def crawl_spider_name(self, name):
        """Schedule given spider by name for crawling."""
        try:
            spider = spiders.create(name)
        except KeyError:
            log.msg('Could not find spider: %s' % name, log.ERROR)
        else:
            self.crawl_spider(spider)

    def start(self):
    def crawl_spider(self, spider):
        """Schedule spider for crawling."""
        requests = spider.start_requests()
        self._crawl_requests(requests, spider)

    def _crawl_requests(self, requests, spider):
        """Shortcut to schedule a list of requests"""
        for req in requests:
            self.crawl_request(req, spider)

    def start(self, keep_alive=False):
        """Start the scrapy server, without scheduling any domains"""
        scrapyengine.keep_alive = True
        scrapyengine.keep_alive = keep_alive
        scrapyengine.start()
        if self.control_reactor:
            reactor.run(installSignalHandlers=False)

@@ -105,6 +85,17 @@ class ExecutionManager(object):
        self.interrupted = True
        scrapyengine.stop()

    def _create_spider_for_request(self, request, default=None, log_none=False, \
            log_multiple=False):
        spider_names = spiders.find_by_request(request)
        if len(spider_names) == 1:
            return spiders.create(spider_names[0])
        if len(spider_names) > 1 and log_multiple:
            log.msg('More than one spider found for: %s' % request, log.ERROR)
        if len(spider_names) == 0 and log_none:
            log.msg('Could not find spider for: %s' % request, log.ERROR)
        return default

    def _signal_shutdown(self, signum, _):
        signame = signal_names[signum]
        log.msg("Received %s, shutting down gracefully. Send again to force " \
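For orientation, the single argument-sniffing crawl(*args) entry point is replaced here by one method per input type. A hedged sketch of how a caller might drive the new API (assuming the scrapymanager singleton used elsewhere in this diff; the URL and spider name are illustrative):

    from scrapy.core.manager import scrapymanager
    from scrapy.http import Request

    scrapymanager.configure()
    scrapymanager.crawl_url('http://scrapy.org')                 # spider resolved per request
    scrapymanager.crawl_request(Request('http://scrapy.org/doc'))
    scrapymanager.crawl_spider_name('example.com')               # spider looked up by name
    scrapymanager.start()                                        # keep_alive=False: exit when idle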
@@ -1,66 +0,0 @@
"""
Crawler class

The Crawler class can be used to crawl pages using the Scrapy crawler from
outside a Scrapy project, for example, from a standalone script.

To use it, instantiate it and call the "crawl" method with one (or more)
requests. For example:

>>> from scrapy.crawler import Crawler
>>> from scrapy.http import Request
>>> def parse_response(response):
...     print "Visited: %s" % response.url
...
>>> request = Request('http://scrapy.org', callback=parse_response)
>>> crawler = Crawler()
>>> crawler.crawl(request)
Visited: http://scrapy.org
>>>

Request callbacks follow the same API of spiders callback, which means that all
requests returned from the callbacks will be followed.

See examples/scripts/count_and_follow_links.py for a more detailed example.

WARNING: The Crawler class currently has a big limitation - it cannot be used
more than once in the same Python process. This is due to the fact that Twisted
reactors cannot be restarted. Hopefully, this limitation will be removed in the
future.
"""

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.conf import settings as scrapy_settings
from scrapy import log

class Crawler(object):

    def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False, \
            settings=None):
        self.stop_on_error = stop_on_error
        self.silence_errors = silence_errors
        # disable offsite middleware (by default) because it prevents free crawling
        if settings is not None:
            settings.overrides.update(settings)
        scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
            'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
        scrapy_settings.overrides['LOG_ENABLED'] = enable_log
        scrapymanager.configure()
        dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)

    def crawl(self, *args):
        scrapymanager.runonce(*args)

    def stop(self):
        scrapyengine.stop()
        log.log_level = log.SILENT
        scrapyengine.kill()

    def _logmessage_received(self, message, level):
        if level <= log.ERROR:
            if not self.silence_errors:
                print "Crawler error: %s" % message
            if self.stop_on_error:
                self.stop()
@@ -96,20 +96,12 @@ class Request(object_ref):
        """Return a copy of this Request"""
        return self.replace()

    def replace(self, url=None, callback=None, method=None, headers=None, body=None, \
            cookies=None, meta=None, encoding=None, priority=None, \
            dont_filter=None, errback=None):
    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        return self.__class__(url=self.url if url is None else url,
                              callback=callback,
                              method=self.method if method is None else method,
                              headers=copy.deepcopy(self.headers) if headers is None else headers,
                              body=self.body if body is None else body,
                              cookies=self.cookies if cookies is None else cookies,
                              meta=self.meta if meta is None else meta,
                              encoding=self.encoding if encoding is None else encoding,
                              priority=self.priority if priority is None else priority,
                              dont_filter=self.dont_filter if dont_filter is None else dont_filter,
                              errback=errback)
        for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', \
                'encoding', 'priority', 'dont_filter']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)
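The rewritten replace() defaults every listed attribute to the current request's value, so callers only pass what they want changed, and an optional cls keyword picks the class used for the copy. A minimal sketch (the callback and subclass names are hypothetical):

    from scrapy.http import Request

    req = Request('http://scrapy.org', meta={'depth': 1})
    post_req = req.replace(method='POST', callback=parse_form_result)  # same url/meta/headers
    sub_req = req.replace(cls=MyRequestSubclass)                       # rebuild as another class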
@@ -71,18 +71,11 @@ class Response(object_ref):
        """Return a copy of this Response"""
        return self.replace()

    def replace(self, url=None, status=None, headers=None, body=None, meta=None, \
            flags=None, cls=None, **kwargs):
    def replace(self, *args, **kwargs):
        """Create a new Response with the same attributes except for those
        given new values.
        """
        if cls is None:
            cls = self.__class__
        new = cls(url=self.url if url is None else url,
                  status=self.status if status is None else status,
                  headers=copy.deepcopy(self.headers) if headers is None else headers,
                  body=self.body if body is None else body,
                  meta=self.meta if meta is None else meta,
                  flags=self.flags if flags is None else flags,
                  **kwargs)
        return new
        for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)
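Response.replace() gets the same treatment and keeps the earlier cls= behaviour through kwargs. A short sketch, assuming HtmlResponse is exposed via scrapy.http as elsewhere in the codebase; the body value is illustrative:

    from scrapy.http import Response, HtmlResponse

    plain = Response('http://scrapy.org', body='<html>...</html>')
    html = plain.replace(cls=HtmlResponse)   # same url and body, HtmlResponse type
    moved = plain.replace(status=301)        # only the status differs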
@@ -23,9 +23,6 @@ class HtmlResponse(TextResponse):
    METATAG_RE = re.compile(r'<meta\s+%s\s+%s' % (_httpequiv_re, _content_re), re.I)
    METATAG_RE2 = re.compile(r'<meta\s+%s\s+%s' % (_content_re, _httpequiv_re), re.I)

    def body_encoding(self):
        return self._body_declared_encoding() or super(HtmlResponse, self).body_encoding()

    @memoizemethod_noargs
    def _body_declared_encoding(self):
        chunk = self.body[:5000]
@@ -6,24 +6,31 @@ See documentation in docs/topics/request-response.rst
"""

import re

import codecs
from scrapy.xlib.BeautifulSoup import UnicodeDammit

from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs
from scrapy.utils.encoding import encoding_exists, resolve_encoding
from scrapy.conf import settings


# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start+1))


class TextResponse(Response):

    _DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
    _ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)

    __slots__ = ['_encoding', '_body_inferred_encoding']
    __slots__ = ['_encoding', '_cached_benc', '_cached_ubody']

    def __init__(self, url, status=200, headers=None, body=None, meta=None, \
            flags=None, encoding=None):
        self._encoding = encoding
        self._body_inferred_encoding = None
        self._cached_benc = None
        self._cached_ubody = None
        super(TextResponse, self).__init__(url, status, headers, body, meta, flags)

    def _get_url(self):

@@ -56,31 +63,48 @@ class TextResponse(Response):

    @property
    def encoding(self):
        return self._encoding or self.headers_encoding() or self.body_encoding()
        return self._get_encoding(infer=True)

    @memoizemethod_noargs
    def headers_encoding(self):
        content_type = self.headers.get('Content-Type')
        if content_type:
            encoding = self._ENCODING_RE.search(content_type)
            if encoding:
                return encoding.group(1)
    def _get_encoding(self, infer=False):
        enc = self._declared_encoding()
        if enc and not encoding_exists(enc):
            enc = None
        if not enc and infer:
            enc = self._body_inferred_encoding()
        if not enc:
            enc = self._DEFAULT_ENCODING
        return resolve_encoding(enc)

    def _declared_encoding(self):
        return self._encoding or self._headers_encoding() \
            or self._body_declared_encoding()

    @memoizemethod_noargs
    def body_as_unicode(self):
        """Return body as unicode"""
        possible_encodings = (self._encoding, self.headers_encoding(), \
            self._body_declared_encoding())
        dammit = UnicodeDammit(self.body, possible_encodings)
        self._body_inferred_encoding = dammit.originalEncoding
        if self._body_inferred_encoding in ('ascii', None):
            self._body_inferred_encoding = self._DEFAULT_ENCODING
        return dammit.unicode
        if self._cached_ubody is None:
            self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
        return self._cached_ubody

    def body_encoding(self):
        if self._body_inferred_encoding is None:
            self.body_as_unicode()
        return self._body_inferred_encoding
    @memoizemethod_noargs
    def _headers_encoding(self):
        content_type = self.headers.get('Content-Type')
        if content_type:
            m = self._ENCODING_RE.search(content_type)
            if m:
                encoding = m.group(1)
                if encoding_exists(encoding):
                    return encoding

    def _body_inferred_encoding(self):
        if self._cached_benc is None:
            enc = self._get_encoding()
            dammit = UnicodeDammit(self.body, [enc])
            benc = dammit.originalEncoding
            self._cached_benc = benc
            # UnicodeDammit is buggy decoding utf-16
            if self._cached_ubody is None and benc != 'utf-16':
                self._cached_ubody = dammit.unicode
        return self._cached_benc

    def _body_declared_encoding(self):
        # implemented in subclasses (XmlResponse, HtmlResponse)
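The net effect is a fixed resolution order: the encoding passed to the constructor, then the Content-Type header, then an encoding declared in the body, then body inference, then DEFAULT_RESPONSE_ENCODING. A hedged sketch of what that ordering implies (assuming TextResponse is exposed via scrapy.http like the other response classes; header and body values are illustrative):

    from scrapy.http import TextResponse

    r = TextResponse('http://scrapy.org',
                     headers={'Content-Type': 'text/html; charset=iso-8859-1'},
                     body='caf\xe9')
    r.encoding            # taken from the Content-Type header, no body inference needed
    r.body_as_unicode()   # body decoded with that encoding via the 'scrapy_replace' handler

    r.replace(encoding='utf-8').encoding   # an explicit constructor encoding takes precedence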
@@ -18,9 +18,6 @@ class XmlResponse(TextResponse):
    _encoding_re = _template % ('encoding', r'(?P<charset>[\w-]+)')
    XMLDECL_RE = re.compile(r'<\?xml\s.*?%s' % _encoding_re, re.I)

    def body_encoding(self):
        return self._body_declared_encoding() or super(XmlResponse, self).body_encoding()

    @memoizemethod_noargs
    def _body_declared_encoding(self):
        chunk = self.body[:5000]
@@ -29,8 +29,9 @@ BOT_NAME = settings['BOT_NAME']
# args: message, level, spider
logmessage_received = object()

# default logging level
# default values
log_level = DEBUG
log_encoding = 'utf-8'

started = False

@@ -47,11 +48,12 @@ def _get_log_level(level_name_or_id=None):

def start(logfile=None, loglevel=None, logstdout=None):
    """Initialize and start logging facility"""
    global log_level, started
    global log_level, log_encoding, started

    if started or not settings.getbool('LOG_ENABLED'):
        return
    log_level = _get_log_level(loglevel)
    log_encoding = settings['LOG_ENCODING']
    started = True

    # set log observer

@@ -73,8 +75,8 @@ def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    dispatcher.send(signal=logmessage_received, message=message, level=level, \
        spider=spider)
    system = domain or (spider.domain_name if spider else component)
    msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
    system = domain or (spider.name if spider else component)
    msg_txt = unicode_to_str("%s: %s" % (level_names[level], message), log_encoding)
    log.msg(msg_txt, system=system)

def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None):

@@ -91,7 +93,7 @@ def err(_stuff=None, _why=None, **kwargs):
        import warnings
        warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \
            "use 'spider' argument instead", DeprecationWarning, stacklevel=2)
    kwargs['system'] = domain or (spider.domain_name if spider else component)
    kwargs['system'] = domain or (spider.name if spider else component)
    if _why:
        _why = unicode_to_str("ERROR: %s" % _why)
        _why = unicode_to_str("ERROR: %s" % _why, log_encoding)
    log.err(_stuff, _why, **kwargs)
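Two things change for callers: the log "system" now comes from spider.name instead of spider.domain_name, and unicode messages are encoded with the LOG_ENCODING setting before being handed to Twisted. A brief sketch (the spider object and message are illustrative):

    from scrapy import log

    log.start()    # picks up LOG_ENCODING (default 'utf-8') along with the log level
    log.msg(u"Scraped 10 items from caf\xe9 page", level=log.INFO, spider=myspider)
    # -> logged as "INFO: Scraped 10 items from caf\xe9 page", with system=myspider.name,
    #    encoded with the configured log encoding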
@@ -47,34 +47,26 @@ class MailSender(object):
                part = MIMEBase(*mimetype.split('/'))
                part.set_payload(f.read())
                Encoders.encode_base64(part)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' % attach_name)
                part.add_header('Content-Disposition', 'attachment; filename="%s"' \
                    % attach_name)
                msg.attach(part)
        else:
            msg.set_payload(body)

        # FIXME ---------------------------------------------------------------------
        # There seems to be a problem with sending emails using deferreds when
        # the last thing left to do is sending the mail, cause the engine stops
        # the reactor and the email don't get send. we need to fix this. until
        # then, we'll revert to use Python standard (IO-blocking) smtplib.

        #dfd = self._sendmail(self.smtphost, self.mailfrom, rcpts, msg.as_string())
        #dfd.addCallbacks(self._sent_ok, self._sent_failed,
        #    callbackArgs=[to, cc, subject, len(attachs)],
        #    errbackArgs=[to, cc, subject, len(attachs)])
        import smtplib
        smtp = smtplib.SMTP(self.smtphost)
        smtp.sendmail(self.mailfrom, rcpts, msg.as_string())
        log.msg('Mail sent: To=%s Cc=%s Subject="%s"' % (to, cc, subject))
        smtp.close()
        # ---------------------------------------------------------------------------
        dfd = self._sendmail(self.smtphost, self.mailfrom, rcpts, msg.as_string())
        dfd.addCallbacks(self._sent_ok, self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
            errbackArgs=[to, cc, subject, len(attachs)])
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)

    def _sent_ok(self, result, to, cc, subject, nattachs):
        log.msg('Mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' % (to, cc, subject, nattachs))
        log.msg('Mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' % \
            (to, cc, subject, nattachs))

    def _sent_failed(self, failure, to, cc, subject, nattachs):
        errstr = str(failure.value)
        log.msg('Unable to send mail: To=%s Cc=%s Subject="%s" Attachs=%d - %s' % (to, cc, subject, nattachs, errstr), level=log.ERROR)
        log.msg('Unable to send mail: To=%s Cc=%s Subject="%s" Attachs=%d - %s' % \
            (to, cc, subject, nattachs, errstr), level=log.ERROR)

    def _sendmail(self, smtphost, from_addr, to_addrs, msg, port=25):
        """ This is based on twisted.mail.smtp.sendmail except that it
@@ -29,8 +29,8 @@ class XPathSelector(object_ref):
            self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
            self.xmlNode = self.doc.xmlDoc
        elif text:
            response = TextResponse(url='about:blank', body=unicode_to_str(text), \
                encoding='utf-8')
            response = TextResponse(url='about:blank', \
                body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
            self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
            self.xmlNode = self.doc.xmlDoc
        self.expr = expr
60 scrapy/service.py Normal file
@@ -0,0 +1,60 @@
import sys, os

from twisted.python import log
from twisted.internet import reactor, protocol, error
from twisted.application.service import Service

from scrapy.utils.py26 import cpu_count
from scrapy.conf import settings


class ScrapyService(Service):

    def startService(self):
        reactor.callWhenRunning(self.start_processes)

    def start_processes(self):
        for i in range(cpu_count()):
            self.start_process(i+1)

    def start_process(self, id):
        args = [sys.executable, '-m', 'scrapy.service']
        env = os.environ.copy()
        self.set_log_file(env, id)
        pp = ScrapyProcessProtocol(self, id, env.get('SCRAPY_LOG_FILE'))
        reactor.spawnProcess(pp, sys.executable, args=args, env=env)

    def set_log_file(self, env, suffix):
        logfile = settings['LOG_FILE']
        if logfile:
            file, ext = os.path.splitext(logfile)
            env['SCRAPY_LOG_FILE'] = "%s-%s%s" % (file, suffix, ext)


class ScrapyProcessProtocol(protocol.ProcessProtocol):

    def __init__(self, service, id, logfile):
        self.service = service
        self.id = id
        self.logfile = logfile
        self.pid = None

    def connectionMade(self):
        self.pid = self.transport.pid
        log.msg("Process %r started: pid=%r logfile=%r" % (self.id, self.pid, \
            self.logfile))

    def processEnded(self, status):
        if isinstance(status.value, error.ProcessDone):
            log.msg("Process %r finished: pid=%r logfile=%r" % (self.id, \
                self.pid, self.logfile))
        else:
            log.msg("Process %r died: exitstatus=%r pid=%r logfile=%r" % \
                (self.id, status.value.exitCode, self.pid, self.logfile))
            reactor.callLater(5, self.service.start_process, self.id)


if __name__ == '__main__':
    from scrapy.core.manager import scrapymanager
    scrapymanager.configure()
    scrapymanager.start(keep_alive=True)
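Each worker process gets its own log file: set_log_file() splits the configured LOG_FILE at its extension and inserts the worker number. A small sketch of the naming this produces (the LOG_FILE value is illustrative):

    import os

    logfile = 'scrapy.log'                       # settings['LOG_FILE']
    for suffix in (1, 2, 3, 4):                  # one worker per CPU
        file, ext = os.path.splitext(logfile)
        print "%s-%s%s" % (file, suffix, ext)    # scrapy-1.log, scrapy-2.log, ...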
@@ -35,6 +35,7 @@ def parse_url(url):
    u = urlparse.urlparse(url)
    return url


class Shell(object):

    requires_project = False

@@ -52,18 +53,21 @@ class Shell(object):
        else:
            url = parse_url(request_or_url)
            request = Request(url)
            spider = spiders.fromurl(url) or BaseSpider('default')

        spider = scrapymanager._create_spider_for_request(request, \
            BaseSpider('default'), log_multiple=True)

        print "Fetching %s..." % request
        response = threads.blockingCallFromThread(reactor, scrapyengine.schedule, \
            request, spider)
        if response:
            self.populate_vars(url, response, request)
            self.populate_vars(url, response, request, spider)
            if print_help:
                self.print_help()
            else:
                print "Done - use shelp() to see available objects"

    def populate_vars(self, url=None, response=None, request=None):
    def populate_vars(self, url=None, response=None, request=None, spider=None):
        item = self.item_class()
        self.vars['item'] = item
        if url:

@@ -73,7 +77,7 @@ class Shell(object):
            self.vars['url'] = url
            self.vars['response'] = response
            self.vars['request'] = request
            self.vars['spider'] = spiders.fromurl(url)
            self.vars['spider'] = spider
            if not self.nofetch:
                self.vars['fetch'] = self.fetch
            self.vars['view'] = open_in_browser

@@ -104,7 +108,7 @@ class Shell(object):
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        reactor.callInThread(self._console_thread, url)
        scrapymanager.start()
        scrapymanager.start(keep_alive=True)

    def inspect_response(self, response):
        print
@@ -3,6 +3,9 @@ Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
"""

import warnings

from zope.interface import Interface, Attribute, invariant, implements
from twisted.plugin import IPlugin

@@ -11,17 +14,9 @@ from scrapy.http import Request
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.trackref import object_ref

def _valid_domain_name(obj):
    """Check the domain name specified is valid"""
    if not obj.domain_name:
        raise ValueError("Spider 'domain_name' attribute is required")

class ISpider(Interface, IPlugin):
    """Interface to be implemented by site-specific web spiders"""

    domain_name = Attribute("The domain name of the site to be scraped.")

    invariant(_valid_domain_name)
    """Interface used by TwistedPluginSpiderManager to discover spiders"""
    pass

class BaseSpider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this

@@ -31,19 +26,37 @@ class BaseSpider(object_ref):
    implements(ISpider)

    # XXX: class attributes kept for backwards compatibility
    domain_name = None
    name = None
    start_urls = []
    extra_domain_names = []
    allowed_domains = []

    def __init__(self, domain_name=None):
        if domain_name is not None:
            self.domain_name = domain_name
    def __init__(self, name=None, **kwargs):
        self.__dict__.update(kwargs)
        # XXX: SEP-12 backward compatibility (remove for 0.10)
        if hasattr(self, 'domain_name'):
            warnings.warn("Spider.domain_name attribute is deprecated, use Spider.name and Spider.allowed_domains instead", \
                DeprecationWarning, stacklevel=4)
            self.name = self.domain_name
            self.allowed_domains = [self.name]
        if hasattr(self, 'extra_domain_names'):
            warnings.warn("Spider.extra_domain_names attribute is deprecated - use Spider.allowed_domains instead", \
                DeprecationWarning, stacklevel=4)
            self.allowed_domains += list(self.extra_domain_names)

        if name is not None:
            self.name = name
        # XXX: create instance attributes (class attributes were kept for
        # backwards compatibility)
        if not self.start_urls:
            self.start_urls = []
        if not self.extra_domain_names:
            self.extra_domain_names = []
        if not self.allowed_domains:
            self.allowed_domains = []
        if not self.name:
            raise ValueError("%s must have a name" % type(self).__name__)

        # XXX: SEP-12 forward compatibility (remove for 0.10)
        self.domain_name = self.name
        self.extra_domain_names = self.allowed_domains

    def log(self, message, level=log.DEBUG):
        """Log the given messages at the given log level. Always use this

@@ -67,6 +80,6 @@ class BaseSpider(object_ref):
        pass

    def __str__(self):
        return "<%s %r>" % (type(self).__name__, self.domain_name)
        return "<%s %r>" % (type(self).__name__, self.name)

    __repr__ = __str__
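For spider authors this boils down to a rename with a compatibility shim: domain_name becomes name, extra_domain_names folds into allowed_domains, and the old attributes keep working but emit DeprecationWarnings. A before/after sketch (the domains are illustrative):

    from scrapy.spider import BaseSpider

    # old style: still accepted, but warns
    class OldSpider(BaseSpider):
        domain_name = 'example.com'
        extra_domain_names = ['cdn.example.com']

    # new style
    class NewSpider(BaseSpider):
        name = 'example.com'
        allowed_domains = ['example.com', 'cdn.example.com']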
@@ -76,11 +76,11 @@ class MemoryStatsCollector(StatsCollector):

    def __init__(self):
        super(MemoryStatsCollector, self).__init__()
        self.domain_stats = {}
        self.spider_stats = {}

    def _persist_stats(self, stats, spider=None):
        if spider is not None:
            self.domain_stats[spider.domain_name] = stats
            self.spider_stats[spider.name] = stats


class DummyStatsCollector(StatsCollector):
@@ -1,31 +0,0 @@
"""
A Stats collector for persisting stats (pickled) to a MySQL db
"""

import cPickle as pickle
from datetime import datetime

from scrapy.stats.collector import StatsCollector
from scrapy.utils.mysql import mysql_connect
from scrapy.conf import settings

class MysqlStatsCollector(StatsCollector):

    def __init__(self):
        super(MysqlStatsCollector, self).__init__()
        mysqluri = settings['STATS_MYSQL_URI']
        self._mysql_conn = mysql_connect(mysqluri, use_unicode=False) if mysqluri else None

    def _persist_stats(self, stats, spider=None):
        if spider is None: # only store spider-specific stats
            return
        if self._mysql_conn is None:
            return
        stored = datetime.utcnow()
        datas = pickle.dumps(stats)
        table = 'domain_data_history'

        c = self._mysql_conn.cursor()
        c.execute("INSERT INTO %s (domain,stored,data) VALUES (%%s,%%s,%%s)" % table, \
            (spider.domain_name, stored, datas))
        self._mysql_conn.commit()
@@ -36,9 +36,9 @@ class SimpledbStatsCollector(StatsCollector):

    def _persist_to_sdb(self, spider, stats):
        ts = self._get_timestamp(spider).isoformat()
        sdb_item_id = "%s_%s" % (spider.domain_name, ts)
        sdb_item_id = "%s_%s" % (spider.name, ts)
        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
        sdb_item['domain'] = spider.domain_name
        sdb_item['spider'] = spider.name
        sdb_item['timestamp'] = self._to_sdb_value(ts)
        connect_sdb().put_attributes(self._sdbdomain, sdb_item_id, sdb_item)
@@ -4,5 +4,5 @@
# See: http://doc.scrapy.org/topics/item-pipeline.html

class ${ProjectName}Pipeline(object):
    def process_item(self, domain, item):
    def process_item(self, spider, item):
        return item
@@ -1,9 +1,10 @@
from scrapy.spider import BaseSpider

class $classname(BaseSpider):
    domain_name = "$site"
    name = "$name"
    allowed_domains = ["$domain"]
    start_urls = (
        'http://www.$site/',
        'http://www.$domain/',
    )

    def parse(self, response):
@@ -6,19 +6,20 @@ from scrapy.contrib.spiders import CrawlSpider, Rule
from $project_name.items import ${ProjectName}Item

class $classname(CrawlSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/']
    name = '$name'
    allowed_domains = ['$domain']
    start_urls = ['http://www.$domain/']

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        xs = HtmlXPathSelector(response)
        hxs = HtmlXPathSelector(response)
        i = ${ProjectName}Item()
        #i['site_id'] = xs.select('//input[@id="sid"]/@value').extract()
        #i['name'] = xs.select('//div[@id="name"]').extract()
        #i['description'] = xs.select('//div[@id="description"]').extract()
        #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract()
        #i['name'] = hxs.select('//div[@id="name"]').extract()
        #i['description'] = hxs.select('//div[@id="description"]').extract()
        return i

SPIDER = $classname()
@@ -2,8 +2,9 @@ from scrapy.contrib.spiders import CSVFeedSpider
from $project_name.items import ${ProjectName}Item

class $classname(CSVFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.csv']
    name = '$name'
    allowed_domains = ['$domain']
    start_urls = ['http://www.$domain/feed.csv']
    # headers = ['id', 'name', 'description', 'image_link']
    # delimiter = '\t'
@@ -2,8 +2,9 @@ from scrapy.contrib.spiders import XMLFeedSpider
from $project_name.items import ${ProjectName}Item

class $classname(XMLFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.xml']
    name = '$name'
    allowed_domains = ['$domain']
    start_urls = ['http://www.$domain/feed.xml']

    def parse_item(self, response, selector):
        i = ${ProjectName}Item()
@@ -6,9 +6,6 @@ To run all Scrapy unittests go to Scrapy main dir and type:

    bin/runtests.sh

If you're in windows use runtests.bat instead.

Keep in mind that some tests may be skipped if you don't have some (optional)
modules available like MySQLdb or simplejson, but that's not a problem.
"""

import os
@@ -59,10 +59,18 @@ class CommandTest(ProjectTest):

class GenspiderCommandTest(CommandTest):

    def test_arguments(self):
        # only pass one argument. spider script shouldn't be created
        self.assertEqual(0, self.call('genspider', 'test_name'))
        assert not exists(join(self.proj_mod_path, 'spiders', 'test_name.py'))
        # pass two arguments <name> <domain>. spider script should be created
        self.assertEqual(0, self.call('genspider', 'test_name', 'test.com'))
        assert exists(join(self.proj_mod_path, 'spiders', 'test_name.py'))

    def test_template_default(self, *args):
        self.assertEqual(0, self.call('genspider', 'testspider', 'test.com', *args))
        assert exists(join(self.proj_mod_path, 'spiders', 'testspider.py'))
        self.assertEqual(1, self.call('genspider', 'otherspider', 'test.com'))
        self.assertEqual(0, self.call('genspider', 'test_spider', 'test.com', *args))
        assert exists(join(self.proj_mod_path, 'spiders', 'test_spider.py'))
        self.assertEqual(1, self.call('genspider', 'test_spider', 'test.com'))

    def test_template_basic(self):
        self.test_template_default('--template=basic')
94 scrapy/tests/test_contrib_exp_crawlspider_matchers.py Normal file
@@ -0,0 +1,94 @@
from twisted.trial import unittest

from scrapy.http import Request
from scrapy.http import Response

from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher

import re

class MatchersTest(unittest.TestCase):

    def setUp(self):
        pass

    def test_base_matcher(self):
        matcher = BaseMatcher()

        request = Request('http://example.com')
        response = Response('http://example.com')

        self.assertTrue(matcher.matches_request(request))
        self.assertTrue(matcher.matches_response(response))

    def test_url_matcher(self):
        matcher = UrlMatcher('http://example.com')

        request = Request('http://example.com')
        response = Response('http://example.com')

        self.failUnless(matcher.matches_request(request))
        self.failUnless(matcher.matches_response(response))

        request = Request('http://example2.com')
        response = Response('http://example2.com')

        self.failIf(matcher.matches_request(request))
        self.failIf(matcher.matches_response(response))

    def test_url_regex_matcher(self):
        matcher = UrlRegexMatcher(r'sample')
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
        )
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'sample_fail')
        for url in urls:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))

        matcher = UrlRegexMatcher(r'SAMPLE\d+', re.IGNORECASE)
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

    def test_url_list_matcher(self):
        urls = (
            'http://example.com/sample1.html',
            'http://example.com/sample2.html',
            'http://example.com/sample3.html',
            'http://example.com/sample4.html',
        )
        urls2 = (
            'http://example.com/sample5.html',
            'http://example.com/sample6.html',
            'http://example.com/sample7.html',
            'http://example.com/sample8.html',
            'http://example.com/',
        )
        matcher = UrlListMatcher(urls)

        # match urls
        for url in urls:
            request, response = Request(url), Response(url)
            self.failUnless(matcher.matches_request(request))
            self.failUnless(matcher.matches_response(response))

        # non-match urls
        for url in urls2:
            request, response = Request(url), Response(url)
            self.failIf(matcher.matches_request(request))
            self.failIf(matcher.matches_response(response))
Some files were not shown because too many files have changed in this diff.