Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 23:04:14 +00:00)

Commit 3fb8058016: Automated merge with http://hg.scrapy.org/scrapy-0.8

AUTHORS | 39
@@ -1,28 +1,25 @@
 Scrapy was brought to life by Shane Evans while hacking a scraping framework
 prototype for Mydeco (mydeco.com). It soon became maintained, extended and
-improved by Insophia (insophia.com), with the sponsorship of By Design (the
-company behind Mydeco).
+improved by Insophia (insophia.com), with the initial sponsorship of Mydeco to
+bootstrap the project.
 
-Here is the list of the primary authors & contributors, along with their user
-name (in Scrapy trac/subversion). Emails are intentionally left out to avoid
-spam.
+Here is the list of the primary authors & contributors:
 
-* Pablo Hoffman (pablo)
-* Daniel Graña (daniel)
-* Martin Olveyra (olveyra)
-* Gabriel García (elpolilla)
-* Michael Cetrulo (samus_)
-* Artem Bogomyagkov (artem)
-* Damian Canabal (calarval)
-* Andres Moreira (andres)
-* Ismael Carnales (ismael)
-* Matías Aguirre (omab)
-* German Hoffman (german)
-* Anibal Pacheco (anibal)
+* Pablo Hoffman
+* Daniel Graña
+* Martin Olveyra
+* Gabriel García
+* Michael Cetrulo
+* Artem Bogomyagkov
+* Damian Canabal
+* Andres Moreira
+* Ismael Carnales
+* Matías Aguirre
+* German Hoffmann
+* Anibal Pacheco
 * Bruno Deferrari
 * Shane Evans
-
-And here is the list of people who have helped to put the Scrapy homepage live:
-
-* Ezequiel Rivero (ezequiel)
+* Ezequiel Rivero
+* Patrick Mezard
+* Rolando Espinoza
bin/scrapy.tac | 5 (new file)
@@ -0,0 +1,5 @@
+from twisted.application.service import Application
+from scrapy.service import ScrapyService
+
+application = Application("Scrapy")
+ScrapyService().setServiceParent(application)
docs/experimental/crawlspider-v2.rst | 128 (new file)
@@ -0,0 +1,128 @@
+.. _topics-crawlspider-v2:
+
+==============
+CrawlSpider v2
+==============
+
+Introduction
+============
+
+TODO: introduction
+
+Rules Matching
+==============
+
+TODO: describe purpose of rules
+
+Request Extractors & Processors
+===============================
+
+TODO: describe purpose of extractors & processors
+
+Examples
+========
+
+TODO: plenty of examples
+
+
+.. module:: scrapy.contrib_exp.crawlspider.spider
+   :synopsis: CrawlSpider
+
+
+Reference
+=========
+
+CrawlSpider
+-----------
+
+TODO: describe crawlspider
+
+.. class:: CrawlSpider
+
+   TODO: describe class
+
+
+.. module:: scrapy.contrib_exp.crawlspider.rules
+   :synopsis: Rules
+
+Rules
+-----
+
+TODO: describe spider rules
+
+.. class:: Rule
+
+   TODO: describe Rules class
+
+
+.. module:: scrapy.contrib_exp.crawlspider.reqext
+   :synopsis: Request Extractors
+
+Request Extractors
+------------------
+
+TODO: describe extractors purpose
+
+.. class:: BaseSgmlRequestExtractor
+
+   TODO: describe base extractor
+
+.. class:: SgmlRequestExtractor
+
+   TODO: describe sgml extractor
+
+.. class:: XPathRequestExtractor
+
+   TODO: describe xpath request extractor
+
+
+.. module:: scrapy.contrib_exp.crawlspider.reqproc
+   :synopsis: Request Processors
+
+Request Processors
+------------------
+
+TODO: describe request processors
+
+.. class:: Canonicalize
+
+   TODO: describe proc
+
+.. class:: Unique
+
+   TODO: describe unique
+
+.. class:: FilterDomain
+
+   TODO: describe filter domain
+
+.. class:: FilterUrl
+
+   TODO: describe filter url
+
+
+.. module:: scrapy.contrib_exp.crawlspider.matchers
+   :synopsis: Matchers
+
+Request/Response Matchers
+-------------------------
+
+TODO: describe matchers
+
+.. class:: BaseMatcher
+
+   TODO: describe base matcher
+
+.. class:: UrlMatcher
+
+   TODO: describe url matcher
+
+.. class:: UrlRegexMatcher
+
+   TODO: describe UrlListMatcher
+
+.. class:: UrlListMatcher
+
+   TODO: describe url list matcher
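The reference above is still all TODOs. As a rough orientation, here is a minimal sketch of how the experimental API is exercised, modeled on the ``google_directory`` and ``imdb.com`` example spiders added later in this commit; the class name, domain and URL pattern below are illustrative only:

    # Sketch of the experimental CrawlSpider v2 API, based on the example
    # spiders shipped in this commit. Names, domains and patterns are
    # illustrative, not part of the documented reference above.
    from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
    from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
    from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        # extract candidate requests from <a href="..."> tags
        request_extractors = [SgmlRequestExtractor(tags=['a'], attrs=['href'])]

        # canonicalize and de-duplicate the extracted requests
        request_processors = [Canonicalize(), FilterDupes()]

        # route matching URLs to a callback; follow=True keeps crawling
        rules = (
            Rule(r'/category/', 'parse_category', follow=True),
        )

        def parse_category(self, response):
            self.log("parsing %s" % response.url)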
@@ -21,3 +21,4 @@ it's properly merged) . Use at your own risk.
 djangoitems
 scheduler-middleware
+crawlspider-v2
@@ -128,7 +128,8 @@ Finally, here's the spider code::
 class MininovaSpider(CrawlSpider):
 
-    domain_name = 'mininova.org'
+    name = 'mininova.org'
+    allowed_domains = ['mininova.org']
     start_urls = ['http://www.mininova.org/today']
     rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
 
@@ -102,8 +102,8 @@ to parse the contents of those pages to extract :ref:`items <topics-items>`.
 To create a Spider, you must subclass :class:`scrapy.spider.BaseSpider`, and
 define the three main, mandatory, attributes:
 
-* :attr:`~scrapy.spider.BaseSpider.domain_name`: identifies the Spider. It must
-  be unique, that is, you can't set the same domain name for different Spiders.
+* :attr:`~scrapy.spider.BaseSpider.name`: identifies the Spider. It must be
+  unique, that is, you can't set the same name for different Spiders.
 
 * :attr:`~scrapy.spider.BaseSpider.start_urls`: is a list of URLs where the
   Spider will begin to crawl from. So, the first pages downloaded will be those
@@ -128,7 +128,8 @@ This is the code for our first Spider, save it in a file named
     from scrapy.spider import BaseSpider
 
     class DmozSpider(BaseSpider):
-        domain_name = "dmoz.org"
+        name = "dmoz.org"
+        allowed_domains = ["dmoz.org"]
         start_urls = [
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
@@ -354,7 +355,8 @@ Let's add this code to our spider::
     from scrapy.selector import HtmlXPathSelector
 
     class DmozSpider(BaseSpider):
-        domain_name = "dmoz.org"
+        name = "dmoz.org"
+        allowed_domains = ["dmoz.org"]
         start_urls = [
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
@@ -398,7 +400,8 @@ scraped so far, the code for our Spider should be like this::
     from dmoz.items import DmozItem
 
     class DmozSpider(BaseSpider):
-        domain_name = "dmoz.org"
+        name = "dmoz.org"
+        allowed_domains = ["dmoz.org"]
         start_urls = [
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
             "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
@@ -420,8 +423,8 @@ scraped so far, the code for our Spider should be like this::
 
 Now doing a crawl on the dmoz.org domain yields ``DmozItem``'s::
 
-    [dmoz.org] DEBUG: Scraped DmozItem({'title': [u'Text Processing in Python'], 'link': [u'http://gnosis.cx/TPiP/'], 'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
-    [dmoz.org] DEBUG: Scraped DmozItem({'title': [u'XML Processing with Python'], 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'], 'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
+    [dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n'], link=[u'http://gnosis.cx/TPiP/'], title=[u'Text Processing in Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
+    [dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'], link=[u'http://www.informit.com/store/product.aspx?isbn=0130211192'], title=[u'XML Processing with Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
 
 
 Storing the data (using an Item Pipeline)
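The two DEBUG lines above change because ``Item`` objects now print their fields keyword-style. For reference, a minimal sketch of the tutorial item behind those lines (field names taken from the log output itself; the values are placeholders):

    from scrapy.item import Item, Field

    class DmozItem(Item):
        title = Field()
        link = Field()
        desc = Field()

    # prints with keyword-style fields, matching the new DEBUG format above
    item = DmozItem(title=[u'Text Processing in Python'],
                    link=[u'http://gnosis.cx/TPiP/'],
                    desc=[u'...'])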
@@ -199,7 +199,7 @@ HttpAuthMiddleware
 
     http_user = 'someuser'
     http_pass = 'somepass'
-    domain_name = 'intranet.example.com'
+    name = 'intranet.example.com'
 
     # .. rest of the spider code omitted ...
 
@@ -52,7 +52,7 @@ Exporter to export scraped items to different files, one per spider::
         self.files = {}
 
     def spider_opened(self, spider):
-        file = open('%s_products.xml' % spider.domain_name, 'w+b')
+        file = open('%s_products.xml' % spider.name, 'w+b')
         self.files[spider] = file
         self.exporter = XmlItemExporter(file)
         self.exporter.start_exporting()
@@ -105,10 +105,10 @@ every time a domain/spider is opened and closed::
         dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
     def spider_opened(self, spider):
-        log.msg("opened spider %s" % spider.domain_name)
+        log.msg("opened spider %s" % spider.name)
 
     def spider_closed(self, spider):
-        log.msg("closed spider %s" % spider.domain_name)
+        log.msg("closed spider %s" % spider.name)
 
 
 .. _topics-extensions-ref-manager:
@@ -79,7 +79,8 @@ This is how the spider would look so far::
     from scrapy.contrib.spiders import CrawlSpider, Rule
 
     class GoogleDirectorySpider(CrawlSpider):
-        domain_name = 'directory.google.com'
+        name = 'directory.google.com'
+        allowed_domains = ['directory.google.com']
         start_urls = ['http://directory.google.com/']
 
         rules = (
@@ -98,10 +98,10 @@ spider returns multiples items with the same id::
         del self.duplicates[spider]
 
     def process_item(self, spider, item):
-        if item.id in self.duplicates[spider]:
+        if item['id'] in self.duplicates[spider]:
             raise DropItem("Duplicate item found: %s" % item)
         else:
-            self.duplicates[spider].add(item.id)
+            self.duplicates[spider].add(item['id'])
             return item
 
 Built-in Item Pipelines reference
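For context, here is a self-contained sketch of a duplicates filter written against the new ``item['id']`` access style. It is simplified: the spider_opened/spider_closed signal wiring from the surrounding docs is replaced by a lazily created per-spider set:

    from scrapy.core.exceptions import DropItem

    class DuplicatesPipeline(object):
        """Sketch: drop items whose 'id' was already seen for a given spider."""

        def __init__(self):
            self.duplicates = {}

        def process_item(self, spider, item):
            seen = self.duplicates.setdefault(spider, set())
            if item['id'] in seen:
                raise DropItem("Duplicate item found: %s" % item)
            else:
                seen.add(item['id'])
                return item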
@@ -129,3 +129,14 @@ scrapy.log module
 
 Log level for debugging messages (recommended level for development)
 
+Logging settings
+================
+
+These settings can be used to configure the logging:
+
+* :setting:`LOG_ENABLED`
+* :setting:`LOG_ENCODING`
+* :setting:`LOG_FILE`
+* :setting:`LOG_LEVEL`
+* :setting:`LOG_STDOUT`
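A hedged example of how the settings listed above might be combined in a project's ``settings.py``; every value below is illustrative rather than a documented default:

    # Illustrative values only; see the settings reference for the defaults
    LOG_ENABLED = True
    LOG_ENCODING = 'utf-8'
    LOG_FILE = 'scrapy.log'
    LOG_LEVEL = 'DEBUG'
    LOG_STDOUT = False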
@@ -321,7 +321,7 @@ user name and password. You can use the :meth:`FormRequest.from_response`
 method for this job. Here's an example spider which uses it::
 
     class LoginSpider(BaseSpider):
-        domain_name = 'example.com'
+        name = 'example.com'
         start_urls = ['http://www.example.com/users/login.php']
 
         def parse(self, response):
@@ -466,12 +466,14 @@ TextResponse objects
 
 .. attribute:: TextResponse.encoding
 
-    A string with the encoding of this response. The encoding is resolved in the
-    following order:
+    A string with the encoding of this response. The encoding is resolved by
+    trying the following mechanisms, in order:
 
     1. the encoding passed in the constructor `encoding` argument
 
-    2. the encoding declared in the Content-Type HTTP header
+    2. the encoding declared in the Content-Type HTTP header. If this
+       encoding is not valid (ie. unknown), it is ignored and the next
+       resolution mechanism is tried.
 
     3. the encoding declared in the response body. The TextResponse class
        doesn't provide any special functionality for this. However, the
@@ -483,23 +485,11 @@ TextResponse objects
 :class:`TextResponse` objects support the following methods in addition to
 the standard :class:`Response` ones:
 
-.. method:: TextResponse.headers_encoding()
-
-    Returns a string with the encoding declared in the headers (ie. the
-    Content-Type HTTP header).
-
-.. method:: TextResponse.body_encoding()
-
-    Returns a string with the encoding of the body, either declared or inferred
-    from its contents. The body encoding declaration is implemented in
-    :class:`TextResponse` subclasses such as: :class:`HtmlResponse` or
-    :class:`XmlResponse`.
-
 .. method:: TextResponse.body_as_unicode()
 
     Returns the body of the response as unicode. This is equivalent to::
 
-        response.body.encode(response.encoding)
+        response.body.decode(response.encoding)
 
     But **not** equivalent to::
 
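The one-character fix above matters: the body is a byte string, so producing unicode requires decoding it, which is what ``body_as_unicode()`` is now documented as being equivalent to:

    # response.body is a str (bytes); decode it with the resolved encoding
    unicode_body = response.body.decode(response.encoding)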
@@ -340,16 +340,6 @@ Default: ``True``
 
 Whether to collect depth stats.
 
-.. setting:: DOMAIN_SCHEDULER
-
-SPIDER_SCHEDULER
-----------------
-
-Default: ``'scrapy.contrib.spiderscheduler.FifoSpiderScheduler'``
-
-The Spider Scheduler to use. The spider scheduler returns the next spider to
-scrape.
-
 .. setting:: DOWNLOADER_DEBUG
 
 DOWNLOADER_DEBUG
@@ -418,6 +408,15 @@ supported. Example::
 
     DOWNLOAD_DELAY = 0.25  # 250 ms of delay
 
+This setting is also affected by the :setting:`RANDOMIZE_DOWNLOAD_DELAY`
+setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
+amount of time between requests, but uses a random interval between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`.
+
+Another way to change the download delay (per spider, instead of globally) is
+by using the ``download_delay`` spider attribute, which takes more precedence
+than this setting.
+
 .. setting:: DOWNLOAD_TIMEOUT
 
 DOWNLOAD_TIMEOUT
@@ -439,6 +438,69 @@ The class used to detect and filter duplicate requests.
 The default (``RequestFingerprintDupeFilter``) filters based on request fingerprint
 (using ``scrapy.utils.request.request_fingerprint``) and grouping per domain.
 
+.. setting:: ENCODING_ALIASES
+
+ENCODING_ALIASES
+----------------
+
+Default: ``{}``
+
+A mapping of custom encoding aliases for your project, where the keys are the
+aliases (and must be lower case) and the values are the encodings they map to.
+
+This setting extends the :setting:`ENCODING_ALIASES_BASE` setting which
+contains some default mappings.
+
+.. setting:: ENCODING_ALIASES_BASE
+
+ENCODING_ALIASES_BASE
+---------------------
+
+Default::
+
+    {
+        # gb2312 is superseded by gb18030
+        'gb2312': 'gb18030',
+        'chinese': 'gb18030',
+        'csiso58gb231280': 'gb18030',
+        'euc-cn': 'gb18030',
+        'euccn': 'gb18030',
+        'eucgb2312-cn': 'gb18030',
+        'gb2312-1980': 'gb18030',
+        'gb2312-80': 'gb18030',
+        'iso-ir-58': 'gb18030',
+        # gbk is superseded by gb18030
+        'gbk': 'gb18030',
+        '936': 'gb18030',
+        'cp936': 'gb18030',
+        'ms936': 'gb18030',
+        # latin_1 is a subset of cp1252
+        'latin_1': 'cp1252',
+        'iso-8859-1': 'cp1252',
+        'iso8859-1': 'cp1252',
+        '8859': 'cp1252',
+        'cp819': 'cp1252',
+        'latin': 'cp1252',
+        'latin1': 'cp1252',
+        'l1': 'cp1252',
+        # others
+        'zh-cn': 'gb18030',
+        'win-1251': 'cp1251',
+        'macintosh' : 'mac_roman',
+        'x-sjis': 'shift_jis',
+    }
+
+The default encoding aliases defined in Scrapy. Don't override this setting in
+your project, override :setting:`ENCODING_ALIASES` instead.
+
+The reason why `ISO-8859-1`_ (and all its aliases) are mapped to `CP1252`_ is
+due to a well known browser hack. For more information see: `Character
+encodings in HTML`_.
+
+.. _ISO-8859-1: http://en.wikipedia.org/wiki/ISO/IEC_8859-1
+.. _CP1252: http://en.wikipedia.org/wiki/Windows-1252
+.. _Character encodings in HTML: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+
 .. setting:: EXTENSIONS
 
 EXTENSIONS
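A hedged example of extending the base aliases from a project's ``settings.py``; the alias chosen below is illustrative:

    # Keys must be lower case; values are the codec names they map to
    ENCODING_ALIASES = {
        'sjis': 'shift_jis',
    }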
@@ -517,7 +579,16 @@ LOG_ENABLED
 
 Default: ``True``
 
-Enable logging.
+Whether to enable logging.
+
+.. setting:: LOG_ENCODING
+
+LOG_ENCODING
+------------
+
+Default: ``'utf-8'``
+
+The encoding to use for logging.
 
 .. setting:: LOG_FILE
 
@@ -677,6 +748,27 @@ Example::
 
     NEWSPIDER_MODULE = 'mybot.spiders_dev'
 
+.. setting:: RANDOMIZE_DOWNLOAD_DELAY
+
+RANDOMIZE_DOWNLOAD_DELAY
+------------------------
+
+Default: ``True``
+
+If enabled, Scrapy will wait a random amount of time (between 0.5 and 1.5
+* :setting:`DOWNLOAD_DELAY`) while fetching requests from the same
+spider.
+
+This randomization decreases the chance of the crawler being detected (and
+subsequently blocked) by sites which analyze requests looking for statistically
+significant similarities in the time between their times.
+
+The randomization policy is the same used by `wget`_ ``--random-wait`` option.
+
+If :setting:`DOWNLOAD_DELAY` is zero (default) this option has no effect.
+
+.. _wget: http://www.gnu.org/software/wget/manual/wget.html
+
 .. setting:: REDIRECT_MAX_TIMES
 
 REDIRECT_MAX_TIMES
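As a sketch of the policy described above (not the actual scheduler code), the effective delay between two requests of the same spider works out to roughly:

    import random

    def effective_delay(download_delay, randomize=True):
        """Rough sketch: a random interval in [0.5, 1.5] * DOWNLOAD_DELAY."""
        if not randomize or download_delay == 0:
            return download_delay
        return random.uniform(0.5, 1.5) * download_delay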
@@ -773,7 +865,7 @@ The scheduler to use for crawling.
 SCHEDULER_ORDER
 ---------------
 
-Default: ``'BFO'``
+Default: ``'DFO'``
 
 Scope: ``scrapy.core.scheduler``
 
@@ -858,6 +950,16 @@ Example::
 
     SPIDER_MODULES = ['mybot.spiders_prod', 'mybot.spiders_dev']
 
+.. setting:: SPIDER_SCHEDULER
+
+SPIDER_SCHEDULER
+----------------
+
+Default: ``'scrapy.contrib.spiderscheduler.FifoSpiderScheduler'``
+
+The Spider Scheduler to use. The spider scheduler returns the next spider to
+scrape.
+
 .. setting:: STATS_CLASS
 
 STATS_CLASS
@@ -163,7 +163,7 @@ This can be achieved by using the ``scrapy.shell.inspect_response`` function.
 Here's an example of how you would call it from your spider::
 
     class MySpider(BaseSpider):
-        domain_name = 'example.com'
+        ...
 
         def parse(self, response):
             if response.url == 'http://www.example.com/products.php':
@@ -210,11 +210,8 @@ OffsiteMiddleware
 
 Filters out Requests for URLs outside the domains covered by the spider.
 
-This middleware filters out every request whose host names don't match
-:attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider
-:attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".
-Spider can add more domains to exclude using
-:attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute.
+This middleware filters out every request whose host names aren't in the
+spider's :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute.
 
 When your spider returns a request for a domain not belonging to those
 covered by the spider, this middleware will log a debug message similar to
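A rough sketch of the kind of decision this implies, assuming a simple hostname suffix match against ``allowed_domains`` (this is not the middleware's actual implementation):

    from urlparse import urlparse  # Python 2, matching this codebase

    def is_offsite(url, allowed_domains):
        """Sketch: True if the URL's host is not covered by allowed_domains."""
        host = urlparse(url).hostname or ''
        return not any(host == d or host.endswith('.' + d)
                       for d in allowed_domains)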
@@ -70,20 +70,22 @@ BaseSpider
    requests the given ``start_urls``/``start_requests``, and calls the spider's
    method ``parse`` for each of the resulting responses.
 
-   .. attribute:: domain_name
+   .. attribute:: name
 
-       A string which defines the domain name for this spider, which will also be
-       the unique identifier for this spider (which means you can't have two
-       spider with the same ``domain_name``). This is the most important spider
-       attribute and it's required, and it's the name by which Scrapy will known
-       the spider.
+       A string which defines the name for this spider. The spider name is how
+       the spider is located (and instantiated) by Scrapy, so it must be
+       unique. However, nothing prevents you from instantiating more than one
+       instance of the same spider. This is the most important spider attribute
+       and it's required.
 
-   .. attribute:: extra_domain_names
+       Is recommended to name your spiders after the domain that their crawl.
 
-       An optional list of strings containing additional domains that this
-       spider is allowed to crawl. Requests for URLs not belonging to the
-       domain name specified in :attr:`domain_name` or this list won't be
-       followed.
+   .. attribute:: allowed_domains
+
+       An optional list of strings containing domains that this spider is
+       allowed to crawl. Requests for URLs not belonging to the domain names
+       specified in this list won't be followed if
+       :class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled.
 
    .. attribute:: start_urls
 
@@ -144,7 +146,7 @@ BaseSpider
    .. method:: log(message, [level, component])
 
        Log a message using the :func:`scrapy.log.msg` function, automatically
-       populating the domain argument with the :attr:`domain_name` of this
+       populating the spider argument with the :attr:`name` of this
        spider. For more information see :ref:`topics-logging`.
 
 
@@ -157,7 +159,8 @@ Let's see an example::
     from scrapy.spider import BaseSpider
 
     class MySpider(BaseSpider):
-        domain_name = 'http://www.example.com'
+        name = 'example.com'
+        allowed_domains = ['example.com']
         start_urls = [
             'http://www.example.com/1.html',
             'http://www.example.com/2.html',
@@ -177,7 +180,8 @@ Another example returning multiples Requests and Items from a single callback::
     from myproject.items import MyItem
 
     class MySpider(BaseSpider):
-        domain_name = 'http://www.example.com'
+        name = 'example.com'
+        allowed_domains = ['example.com']
         start_urls = [
             'http://www.example.com/1.html',
             'http://www.example.com/2.html',
@@ -254,7 +258,8 @@ Let's now take a look at an example CrawlSpider with rules::
     from scrapy.item import Item
 
     class MySpider(CrawlSpider):
-        domain_name = 'example.com'
+        name = 'example.com'
+        allowed_domains = ['example.com']
         start_urls = ['http://www.example.com']
 
         rules = (
@@ -378,7 +383,8 @@ These spiders are pretty easy to use, let's have at one example::
     from myproject.items import TestItem
 
     class MySpider(XMLFeedSpider):
-        domain_name = 'example.com'
+        name = 'example.com'
+        allowed_domains = ['example.com']
         start_urls = ['http://www.example.com/feed.xml']
         iterator = 'iternodes' # This is actually unnecesary, since it's the default value
         itertag = 'item'
@@ -435,7 +441,8 @@ Let's see an example similar to the previous one, but using a
     from myproject.items import TestItem
 
     class MySpider(CSVFeedSpider):
-        domain_name = 'example.com'
+        name = 'example.com'
+        allowed_domains = ['example.com']
         start_urls = ['http://www.example.com/feed.csv']
         delimiter = ';'
         headers = ['id', 'name', 'description']
@@ -204,15 +204,15 @@ MemoryStatsCollector
 
 A simple stats collector that keeps the stats of the last scraping run (for
 each spider) in memory, after they're closed. The stats can be accessed
-through the :attr:`domain_stats` attribute, which is a dict keyed by spider
+through the :attr:`spider_stats` attribute, which is a dict keyed by spider
 domain name.
 
 This is the default Stats Collector used in Scrapy.
 
-.. attribute:: domain_stats
+.. attribute:: spider_stats
 
-   A dict of dicts (keyed by spider domain name) containing the stats of
-   the last scraping run for each domain.
+   A dict of dicts (keyed by spider name) containing the stats of the last
+   scraping run for each spider.
 
 DummyStatsCollector
 -------------------
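A small usage sketch of the renamed attribute; ``stats`` stands for a ``MemoryStatsCollector`` instance and the spider name is illustrative:

    last_run = stats.spider_stats.get('example.com', {})
    for key, value in last_run.items():
        print key, value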
||||||
@ -240,11 +240,11 @@ SimpledbStatsCollector
|
|||||||
In addition to the existing stats keys the following keys are added at
|
In addition to the existing stats keys the following keys are added at
|
||||||
persitance time:
|
persitance time:
|
||||||
|
|
||||||
* ``domain``: the spider domain (so you can use it later for querying stats
|
* ``spider``: the spider name (so you can use it later for querying stats
|
||||||
for that domain)
|
for that spider)
|
||||||
* ``timestamp``: the timestamp when the stats were persisited
|
* ``timestamp``: the timestamp when the stats were persisited
|
||||||
|
|
||||||
Both the ``domain`` and ``timestamp`` are used for generating the SimpleDB
|
Both the ``spider`` and ``timestamp`` are used for generating the SimpleDB
|
||||||
item name in order to avoid overwriting stats of previous scraping runs.
|
item name in order to avoid overwriting stats of previous scraping runs.
|
||||||
|
|
||||||
As `required by SimpleDB`_, datetime's are stored in ISO 8601 format and
|
As `required by SimpleDB`_, datetime's are stored in ISO 8601 format and
|
||||||
|
examples/experimental/googledir/googledir/__init__.py | 1 (new file)
@@ -0,0 +1 @@
+# googledir project
examples/experimental/googledir/googledir/items.py | 16 (new file)
@@ -0,0 +1,16 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class GoogledirItem(Item):
+
+    name = Field(default='')
+    url = Field(default='')
+    description = Field(default='')
+
+    def __str__(self):
+        return "Google Category: name=%s url=%s" \
+            % (self['name'], self['url'])
examples/experimental/googledir/googledir/pipelines.py | 22 (new file)
@@ -0,0 +1,22 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+
+from scrapy.core.exceptions import DropItem
+
+class FilterWordsPipeline(object):
+    """
+    A pipeline for filtering out items which contain certain
+    words in their description
+    """
+
+    # put all words in lowercase
+    words_to_filter = ['politics', 'religion']
+
+    def process_item(self, spider, item):
+        for word in self.words_to_filter:
+            if word in unicode(item['description']).lower():
+                raise DropItem("Contains forbidden word: %s" % word)
+        else:
+            return item
examples/experimental/googledir/googledir/settings.py | 21 (new file)
@@ -0,0 +1,21 @@
+# Scrapy settings for googledir project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+# Or you can copy and paste them from where they're defined in Scrapy:
+#
+#     scrapy/conf/default_settings.py
+#
+
+BOT_NAME = 'googledir'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['googledir.spiders']
+NEWSPIDER_MODULE = 'googledir.spiders'
+DEFAULT_ITEM_CLASS = 'googledir.items.GoogledirItem'
+USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+ITEM_PIPELINES = ['googledir.pipelines.FilterWordsPipeline']
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+#     scrapy-ctl.py genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,41 @@
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.loader import XPathItemLoader
+from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
+
+from googledir.items import GoogledirItem
+
+class GoogleDirectorySpider(CrawlSpider):
+
+    name = 'google_directory'
+    allowed_domains = ['directory.google.com']
+    start_urls = ['http://directory.google.com/']
+
+    rules = (
+        # search for categories pattern and follow links
+        Rule(r'/[A-Z][a-zA-Z_/]+$', 'parse_category', follow=True),
+    )
+
+    def parse_category(self, response):
+        # The main selector we're using to extract data from the page
+        main_selector = HtmlXPathSelector(response)
+
+        # The XPath to website links in the directory page
+        xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'
+
+        # Get a list of (sub) selectors to each website node pointed by the XPath
+        sub_selectors = main_selector.select(xpath)
+
+        # Iterate over the sub-selectors to extract data for each website
+        for selector in sub_selectors:
+            item = GoogledirItem()
+
+            l = XPathItemLoader(item=item, selector=selector)
+            l.add_xpath('name', 'a/text()')
+            l.add_xpath('url', 'a/@href')
+            l.add_xpath('description', 'font[2]/text()')
+
+            # Here we populate the item and yield it
+            yield l.load_item()
+
+SPIDER = GoogleDirectorySpider()
examples/experimental/googledir/scrapy-ctl.py | 7 (new file)
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+import os
+os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'googledir.settings')
+
+from scrapy.command.cmdline import execute
+execute()
examples/experimental/imdb/imdb/__init__.py | 1 (new file)
@@ -0,0 +1 @@
+# package
examples/experimental/imdb/imdb/items.py | 12 (new file)
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class ImdbItem(Item):
+    # define the fields for your item here like:
+    # name = Field()
+    title = Field()
+    url = Field()
examples/experimental/imdb/imdb/pipelines.py | 8 (new file)
@@ -0,0 +1,8 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+
+class ImdbPipeline(object):
+    def process_item(self, spider, item):
+        return item
examples/experimental/imdb/imdb/settings.py | 20 (new file)
@@ -0,0 +1,20 @@
+# Scrapy settings for imdb project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+# Or you can copy and paste them from where they're defined in Scrapy:
+#
+#     scrapy/conf/default_settings.py
+#
+
+BOT_NAME = 'imdb'
+BOT_VERSION = '1.0'
+
+SPIDER_MODULES = ['imdb.spiders']
+NEWSPIDER_MODULE = 'imdb.spiders'
+DEFAULT_ITEM_CLASS = 'imdb.items.ImdbItem'
+USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
examples/experimental/imdb/imdb/spiders/__init__.py | 8 (new file)
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+#     scrapy-ctl.py genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
examples/experimental/imdb/imdb/spiders/imdb_site.py | 141 (new file)
@@ -0,0 +1,141 @@
+from scrapy.http import Request
+from scrapy.selector import HtmlXPathSelector
+from scrapy.contrib.loader import XPathItemLoader
+from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule
+from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
+from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, \
+    FilterDupes, FilterUrl
+from scrapy.utils.url import urljoin_rfc
+
+from imdb.items import ImdbItem, Field
+
+from itertools import chain, imap, izip
+
+class UsaOpeningWeekMovie(ImdbItem):
+    pass
+
+class UsaTopWeekMovie(ImdbItem):
+    pass
+
+class Top250Movie(ImdbItem):
+    rank = Field()
+    rating = Field()
+    year = Field()
+    votes = Field()
+
+class MovieItem(ImdbItem):
+    release_date = Field()
+    tagline = Field()
+
+
+class ImdbSiteSpider(CrawlSpider):
+    name = 'imdb.com'
+    allowed_domains = ['imdb.com']
+    start_urls = ['http://www.imdb.com/']
+
+    # extract requests using this classes from urls matching 'follow' flag
+    request_extractors = [
+        SgmlRequestExtractor(tags=['a'], attrs=['href']),
+    ]
+
+    # process requests using this classes from urls matching 'follow' flag
+    request_processors = [
+        Canonicalize(),
+        FilterDupes(),
+        FilterUrl(deny=r'/tt\d+/$'), # deny movie url as we will dispatch
+                                     # manually the movie requests
+    ]
+
+    # include domain bit for demo purposes
+    rules = (
+        # these two rules expects requests from start url
+        Rule(r'imdb.com/nowplaying/$', 'parse_now_playing'),
+        Rule(r'imdb.com/chart/top$', 'parse_top_250'),
+        # this rule will parse requests manually dispatched
+        Rule(r'imdb.com/title/tt\d+/$', 'parse_movie_info'),
+    )
+
+    def parse_now_playing(self, response):
+        """Scrapes USA openings this week and top 10 in week"""
+        self.log("Parsing USA Top Week")
+        hxs = HtmlXPathSelector(response)
+
+        _urljoin = lambda url: self._urljoin(response, url)
+
+        #
+        # openings this week
+        #
+        openings = hxs.select('//table[@class="movies"]//a[@class="title"]')
+        boxoffice = hxs.select('//table[@class="boxoffice movies"]//a[@class="title"]')
+
+        opening_titles = openings.select('text()').extract()
+        opening_urls = imap(_urljoin, openings.select('@href').extract())
+
+        box_titles = boxoffice.select('text()').extract()
+        box_urls = imap(_urljoin, boxoffice.select('@href').extract())
+
+        # items
+        opening_items = (UsaOpeningWeekMovie(title=title, url=url)
+                         for (title, url)
+                         in izip(opening_titles, opening_urls))
+
+        box_items = (UsaTopWeekMovie(title=title, url=url)
+                     for (title, url)
+                     in izip(box_titles, box_urls))
+
+        # movie requests
+        requests = imap(self.make_requests_from_url,
+                        chain(opening_urls, box_urls))
+
+        return chain(opening_items, box_items, requests)
+
+    def parse_top_250(self, response):
+        """Scrapes movies from top 250 list"""
+        self.log("Parsing Top 250")
+        hxs = HtmlXPathSelector(response)
+
+        # scrap each row in the table
+        rows = hxs.select('//div[@id="main"]/table/tr//a/ancestor::tr')
+        for row in rows:
+            fields = row.select('td//text()').extract()
+            url, = row.select('td//a/@href').extract()
+            url = self._urljoin(response, url)
+
+            item = Top250Movie()
+            item['title'] = fields[2]
+            item['url'] = url
+            item['rank'] = fields[0]
+            item['rating'] = fields[1]
+            item['year'] = fields[3]
+            item['votes'] = fields[4]
+
+            # scrapped top250 item
+            yield item
+            # fetch movie
+            yield self.make_requests_from_url(url)
+
+    def parse_movie_info(self, response):
+        """Scrapes movie information"""
+        self.log("Parsing Movie Info")
+        hxs = HtmlXPathSelector(response)
+        selector = hxs.select('//div[@class="maindetails"]')
+
+        item = MovieItem()
+        # set url
+        item['url'] = response.url
+
+        # use item loader for other attributes
+        l = XPathItemLoader(item=item, selector=selector)
+        l.add_xpath('title', './/h1/text()')
+        l.add_xpath('release_date', './/h5[text()="Release Date:"]'
+                    '/following-sibling::div/text()')
+        l.add_xpath('tagline', './/h5[text()="Tagline:"]'
+                    '/following-sibling::div/text()')
+
+        yield l.load_item()
+
+    def _urljoin(self, response, url):
+        """Helper to convert relative urls to absolute"""
+        return urljoin_rfc(response.url, url, response.encoding)
+
+
+SPIDER = ImdbSiteSpider()
examples/experimental/imdb/scrapy-ctl.py | 7 (new file)
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+import os
+os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'imdb.settings')
+
+from scrapy.command.cmdline import execute
+execute()
@@ -6,7 +6,8 @@ from googledir.items import GoogledirItem
 
 class GoogleDirectorySpider(CrawlSpider):
 
-    domain_name = 'directory.google.com'
+    name = 'directory.google.com'
+    allow_domains = ['directory.google.com']
     start_urls = ['http://directory.google.com/']
 
     rules = (
@@ -1,51 +0,0 @@
-"""
-Simple script to follow links from a start url. The links are followed in no
-particular order.
-
-Usage:
-count_and_follow_links.py <start_url> <links_to_follow>
-
-Example:
-count_and_follow_links.py http://scrapy.org/ 20
-
-For each page visisted, this script will print the page body size and the
-number of links found.
-"""
-
-import sys
-from urlparse import urljoin
-
-from scrapy.crawler import Crawler
-from scrapy.selector import HtmlXPathSelector
-from scrapy.http import Request, HtmlResponse
-
-links_followed = 0
-
-def parse(response):
-    global links_followed
-    links_followed += 1
-    if links_followed >= links_to_follow:
-        crawler.stop()
-
-    # ignore non-HTML responses
-    if not isinstance(response, HtmlResponse):
-        return
-
-    links = HtmlXPathSelector(response).select('//a/@href').extract()
-    abslinks = [urljoin(response.url, l) for l in links]
-
-    print "page %2d/%d: %s" % (links_followed, links_to_follow, response.url)
-    print "  size : %d bytes" % len(response.body)
-    print "  links: %d" % len(links)
-    print
-
-    return [Request(l, callback=parse) for l in abslinks]
-
-if len(sys.argv) != 3:
-    print __doc__
-    sys.exit(2)
-
-start_url, links_to_follow = sys.argv[1], int(sys.argv[2])
-request = Request(start_url, callback=parse)
-crawler = Crawler()
-crawler.crawl(request)
@@ -1,72 +0,0 @@
-DROP TABLE IF EXISTS `url_history`;
-DROP TABLE IF EXISTS `version`;
-DROP TABLE IF EXISTS `url_status`;
-DROP TABLE IF EXISTS `ticket`;
-DROP TABLE IF EXISTS `domain_stats`;
-DROP TABLE IF EXISTS `domain_stats_history`;
-DROP TABLE IF EXISTS `domain_data_history`;
-
-CREATE TABLE `ticket` (
-  `guid` char(40) NOT NULL,
-  `domain` varchar(255) default NULL,
-  `url` varchar(2048) default NULL,
-  `url_hash` char(40) default NULL, -- so we can join to url_status
-  PRIMARY KEY (`guid`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `version` (
-  `id` bigint(20) NOT NULL auto_increment,
-  `guid` char(40) NOT NULL,
-  `version` char(40) NOT NULL,
-  `seen` datetime NOT NULL,
-  PRIMARY KEY (`id`),
-  FOREIGN KEY (`guid`) REFERENCES ticket(guid) ON UPDATE CASCADE ON DELETE CASCADE,
-  UNIQUE KEY (`version`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `url_status` (
-  -- see http://support.microsoft.com/kb/q208427/ for explanation of 2048
-  `url_hash` char(40) NOT NULL, -- for faster searches
-  `url` varchar(2048) NOT NULL,
-  `parent_hash` char(40) default NULL, -- the url that was followed to this one - for reporting
-  `last_version` char(40) default NULL, -- can be null if it generated an error the last time is was checked
-  `last_checked` datetime NOT NULL,
-  PRIMARY KEY (`url_hash`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `url_history` (
-  `url_hash` char(40) NOT NULL,
-  `version` char(40) NOT NULL,
-  `postdata_hash` char(40) default NULL,
-  `created` datetime NOT NULL,
-  PRIMARY KEY (`version`),
-  FOREIGN KEY (`url_hash`) REFERENCES url_status(url_hash) ON UPDATE CASCADE ON DELETE CASCADE
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `domain_stats` (
-  `key1` varchar(128) NOT NULL,
-  `key2` varchar(128) NOT NULL,
-  `value` text,
-  PRIMARY KEY `key1_key2` (`key1`, `key2`),
-  KEY `key1` (`key1`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `domain_stats_history` (
-  `id` bigint(20) NOT NULL auto_increment,
-  `key1` varchar(128) NOT NULL,
-  `key2` varchar(128) NOT NULL,
-  `value` varchar(2048) NOT NULL,
-  `stored` datetime NOT NULL,
-  PRIMARY KEY (`id`),
-  KEY `key1_key2` (`key1`, `key2`),
-  KEY `key1` (`key1`),
-  KEY `stored` (`stored`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-
-CREATE TABLE `domain_data_history` (
-  `domain` varchar(255) NOT NULL,
-  `stored` datetime NOT NULL,
-  `data` text,
-  KEY `domain_stored` (`domain`, `stored`),
-  KEY `domain` (`domain`)
-) ENGINE=InnoDB DEFAULT CHARSET=utf8;
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 8, 0, '', 0)
-__version__ = "0.8"
+version_info = (0, 9, 0, 'dev')
+__version__ = "0.9-dev"
 
 import sys, os, warnings
 
@@ -17,11 +17,6 @@ warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
 # monkey patches to fix external library issues
 from scrapy.xlib import twisted_250_monkeypatches
 
-# add some common encoding aliases not included by default in Python
-from scrapy.utils.encoding import add_encoding_alias
-add_encoding_alias('gb2312', 'zh-cn')
-add_encoding_alias('cp1251', 'win-1251')
-
 # optional_features is a set containing Scrapy optional features
 optional_features = set()
 
@@ -7,20 +7,14 @@ import cProfile
 
 import scrapy
 from scrapy import log
-from scrapy.spider import spiders
 from scrapy.xlib import lsprofcalltree
 from scrapy.conf import settings
 from scrapy.command.models import ScrapyCommand
+from scrapy.utils.signal import send_catch_log
 
-# This dict holds information about the executed command for later use
-command_executed = {}
-
-def _save_command_executed(cmdname, cmd, args, opts):
-    """Save command executed info for later reference"""
-    command_executed['name'] = cmdname
-    command_executed['class'] = cmd
-    command_executed['args'] = args[:]
-    command_executed['opts'] = opts.__dict__.copy()
+# Signal that carries information about the command which was executed
+# args: cmdname, cmdobj, args, opts
+command_executed = object()
 
 def _find_commands(dir):
     try:
|
|||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
del args[0] # remove command name from args
|
del args[0] # remove command name from args
|
||||||
_save_command_executed(cmdname, cmd, args, opts)
|
send_catch_log(signal=command_executed, cmdname=cmdname, cmdobj=cmd, \
|
||||||
|
args=args, opts=opts)
|
||||||
from scrapy.core.manager import scrapymanager
|
from scrapy.core.manager import scrapymanager
|
||||||
scrapymanager.configure(control_reactor=True)
|
scrapymanager.configure(control_reactor=True)
|
||||||
ret = _run_command(cmd, args, opts)
|
ret = _run_command(cmd, args, opts)
|
||||||
@ -136,23 +131,25 @@ def execute(argv=None):
|
|||||||
|
|
||||||
def _run_command(cmd, args, opts):
|
def _run_command(cmd, args, opts):
|
||||||
if opts.profile or opts.lsprof:
|
if opts.profile or opts.lsprof:
|
||||||
if opts.profile:
|
return _run_command_profiled(cmd, args, opts)
|
||||||
log.msg("writing cProfile stats to %r" % opts.profile)
|
|
||||||
if opts.lsprof:
|
|
||||||
log.msg("writing lsprof stats to %r" % opts.lsprof)
|
|
||||||
loc = locals()
|
|
||||||
p = cProfile.Profile()
|
|
||||||
p.runctx('ret = cmd.run(args, opts)', globals(), loc)
|
|
||||||
if opts.profile:
|
|
||||||
p.dump_stats(opts.profile)
|
|
||||||
k = lsprofcalltree.KCacheGrind(p)
|
|
||||||
if opts.lsprof:
|
|
||||||
with open(opts.lsprof, 'w') as f:
|
|
||||||
k.output(f)
|
|
||||||
ret = loc['ret']
|
|
||||||
else:
|
else:
|
||||||
ret = cmd.run(args, opts)
|
return cmd.run(args, opts)
|
||||||
return ret
|
|
||||||
|
def _run_command_profiled(cmd, args, opts):
|
||||||
|
if opts.profile:
|
||||||
|
log.msg("writing cProfile stats to %r" % opts.profile)
|
||||||
|
if opts.lsprof:
|
||||||
|
log.msg("writing lsprof stats to %r" % opts.lsprof)
|
||||||
|
loc = locals()
|
||||||
|
p = cProfile.Profile()
|
||||||
|
p.runctx('ret = cmd.run(args, opts)', globals(), loc)
|
||||||
|
if opts.profile:
|
||||||
|
p.dump_stats(opts.profile)
|
||||||
|
k = lsprofcalltree.KCacheGrind(p)
|
||||||
|
if opts.lsprof:
|
||||||
|
with open(opts.lsprof, 'w') as f:
|
||||||
|
k.output(f)
|
||||||
|
return loc['ret']
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
execute()
|
execute()
|
||||||
|
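The module-level dict is replaced by a signal object sent through send_catch_log. A minimal sketch (not part of this changeset) of how third-party code could subscribe to it, assuming the pydispatch-based dispatcher used elsewhere in Scrapy at this point:

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.command.cmdline import command_executed

    def on_command_executed(cmdname, cmdobj, args, opts):
        # keyword names mirror those passed to send_catch_log above
        print "command executed: %s %r" % (cmdname, args)

    dispatcher.connect(on_command_executed, signal=command_executed)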
@@ -1,20 +1,27 @@
+from scrapy import log
 from scrapy.command import ScrapyCommand
 from scrapy.core.manager import scrapymanager
 from scrapy.conf import settings
+from scrapy.http import Request
+from scrapy.spider import spiders
+from scrapy.utils.url import is_url

+from collections import defaultdict

 class Command(ScrapyCommand):

     requires_project = True

     def syntax(self):
-        return "[options] <domain|url> ..."
+        return "[options] <spider|url> ..."

     def short_desc(self):
-        return "Start crawling a domain or URL"
+        return "Start crawling from a spider or URL"

     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
+        parser.add_option("--spider", dest="spider", default=None, \
+            help="always use this spider when arguments are urls")
         parser.add_option("-n", "--nofollow", dest="nofollow", action="store_true", \
             help="don't follow links (for use with URLs only)")

@@ -24,4 +31,45 @@ class Command(ScrapyCommand):
             settings.overrides['CRAWLSPIDER_FOLLOW_LINKS'] = False

     def run(self, args, opts):
-        scrapymanager.runonce(*args)
+        urls, names = self._split_urls_and_names(args)
+        for name in names:
+            scrapymanager.crawl_spider_name(name)
+
+        if opts.spider:
+            try:
+                spider = spiders.create(opts.spider)
+                for url in urls:
+                    scrapymanager.crawl_url(url, spider)
+            except KeyError:
+                log.msg('Could not find spider: %s' % opts.spider, log.ERROR)
+        else:
+            for name, urls in self._group_urls_by_spider(urls):
+                spider = spiders.create(name)
+                for url in urls:
+                    scrapymanager.crawl_url(url, spider)
+
+        scrapymanager.start()
+
+    def _group_urls_by_spider(self, urls):
+        spider_urls = defaultdict(list)
+        for url in urls:
+            spider_names = spiders.find_by_request(Request(url))
+            if not spider_names:
+                log.msg('Could not find spider for url: %s' % url,
+                    log.ERROR)
+            elif len(spider_names) > 1:
+                log.msg('More than one spider found for url: %s' % url,
+                    log.ERROR)
+            else:
+                spider_urls[spider_names[0]].append(url)
+        return spider_urls.items()
+
+    def _split_urls_and_names(self, args):
+        urls = []
+        names = []
+        for arg in args:
+            if is_url(arg):
+                urls.append(arg)
+            else:
+                names.append(arg)
+        return urls, names
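With this change the crawl command accepts a mix of spider names and URLs; the helpers above do the dispatching. A standalone illustration of the split (example arguments are hypothetical):

    from scrapy.utils.url import is_url

    args = ['example.com', 'http://example.com/some/page']
    urls = [a for a in args if is_url(a)]        # ['http://example.com/some/page']
    names = [a for a in args if not is_url(a)]   # ['example.com']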
@@ -1,7 +1,11 @@
 import pprint

+from scrapy import log
 from scrapy.command import ScrapyCommand
-from scrapy.utils.fetch import fetch
+from scrapy.core.manager import scrapymanager
+from scrapy.http import Request
+from scrapy.spider import BaseSpider, spiders
+from scrapy.utils.url import is_url

 class Command(ScrapyCommand):

@@ -19,17 +23,33 @@ class Command(ScrapyCommand):

     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
+        parser.add_option("--spider", dest="spider",
+            help="use this spider")
         parser.add_option("--headers", dest="headers", action="store_true", \
             help="print response HTTP headers instead of body")

     def run(self, args, opts):
-        if len(args) != 1:
-            print "One URL is required"
-            return
+        if len(args) != 1 or not is_url(args[0]):
+            return False
+        responses = [] # to collect downloaded responses
+        request = Request(args[0], callback=responses.append, dont_filter=True)

-        responses = fetch(args)
+        if opts.spider:
+            try:
+                spider = spiders.create(opts.spider)
+            except KeyError:
+                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)
+        else:
+            spider = scrapymanager._create_spider_for_request(request, \
+                BaseSpider('default'))
+
+        scrapymanager.crawl_request(request, spider)
+        scrapymanager.start()
+
+        # display response
         if responses:
             if opts.headers:
                 pprint.pprint(responses[0].headers)
             else:
                 print responses[0].body

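The removed scrapy.utils.fetch helper is replaced by scheduling a single request whose callback simply appends to a list. The same pattern in isolation (it only yields a result once the engine has run the request):

    from scrapy.http import Request

    responses = []
    request = Request('http://scrapy.org', callback=responses.append, dont_filter=True)
    # after scrapymanager.crawl_request(request, spider) and scrapymanager.start(),
    # responses[0] holds the downloaded Response (if the download succeeded)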
@@ -15,10 +15,11 @@ SPIDER_TEMPLATES_PATH = join(scrapy.__path__[0], 'templates', 'spiders')


 def sanitize_module_name(module_name):
-    """Sanitize the given module name, by replacing dashes with underscores and
-    prefixing it with a letter if it doesn't start with one
+    """Sanitize the given module name, by replacing dashes and dots
+    with underscores and prefixing it with a letter if it doesn't start
+    with one
     """
-    module_name = module_name.replace('-', '_')
+    module_name = module_name.replace('-', '_').replace('.', '_')
     if module_name[0] not in string.ascii_letters:
         module_name = "a" + module_name
     return module_name
@@ -28,7 +29,7 @@ class Command(ScrapyCommand):
     requires_project = True

     def syntax(self):
-        return "[options] <spider_module_name> <spider_domain_name>"
+        return "[options] <name> <domain>"

     def short_desc(self):
         return "Generate new spider based on template passed with -t or --template"
@@ -54,28 +55,37 @@ class Command(ScrapyCommand):
             print template.read()
             return

-        if len(args) < 2:
+        if len(args) != 2:
             return False

-        module = sanitize_module_name(args[0])
+        name = args[0]
         domain = args[1]
-        spider = spiders.fromdomain(domain)
-        if spider and not opts.force:
-            print "Spider '%s' already exists in module:" % domain
-            print "  %s" % spider.__module__
-            sys.exit(1)
+
+        module = sanitize_module_name(name)
+
+        # if spider already exists and not force option then halt
+        try:
+            spider = spiders.create(name)
+        except KeyError:
+            pass
+        else:
+            if not opts.force:
+                print "Spider '%s' already exists in module:" % name
+                print "  %s" % spider.__module__
+                sys.exit(1)

         template_file = self._find_template(opts.template)
         if template_file:
-            self._genspider(module, domain, opts.template, template_file)
+            self._genspider(module, name, domain, opts.template, template_file)

-    def _genspider(self, module, domain, template_name, template_file):
+    def _genspider(self, module, name, domain, template_name, template_file):
         """Generate the spider module, based on the given template"""
         tvars = {
             'project_name': settings.get('BOT_NAME'),
             'ProjectName': string_camelcase(settings.get('BOT_NAME')),
             'module': module,
-            'site': domain,
+            'name': name,
+            'domain': domain,
             'classname': '%sSpider' % ''.join([s.capitalize() \
                 for s in module.split('_')])
         }
@@ -86,7 +96,7 @@ class Command(ScrapyCommand):

         shutil.copyfile(template_file, spider_file)
         render_templatefile(spider_file, **tvars)
-        print "Created spider %r using template %r in module:" % (domain, \
+        print "Created spider %r using template %r in module:" % (name, \
             template_name)
         print "  %s.%s" % (spiders_module.__name__, module)

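For reference, the updated helper now collapses both dashes and dots, so a spider name that looks like a domain yields a valid module name (example names are hypothetical):

    import string

    def sanitize_module_name(module_name):
        module_name = module_name.replace('-', '_').replace('.', '_')
        if module_name[0] not in string.ascii_letters:
            module_name = "a" + module_name
        return module_name

    sanitize_module_name('www.example-shop.com')   # -> 'www_example_shop_com'
    sanitize_module_name('123.example.com')        # -> 'a123_example_com'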
@@ -1,11 +1,15 @@
 from scrapy.command import ScrapyCommand
-from scrapy.utils.fetch import fetch
+from scrapy.core.manager import scrapymanager
 from scrapy.http import Request
 from scrapy.item import BaseItem
 from scrapy.spider import spiders
 from scrapy.utils import display
+from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.url import is_url
 from scrapy import log

+from collections import defaultdict

 class Command(ScrapyCommand):

     requires_project = True
@@ -18,6 +22,8 @@ class Command(ScrapyCommand):

     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
+        parser.add_option("--spider", dest="spider", default=None, \
+            help="always use this spider")
         parser.add_option("--nolinks", dest="nolinks", action="store_true", \
             help="don't show extracted links")
         parser.add_option("--noitems", dest="noitems", action="store_true", \
@@ -37,18 +43,13 @@ class Command(ScrapyCommand):
         return item

     def run_callback(self, spider, response, callback, args, opts):
-        spider = spiders.fromurl(response.url)
-        if not spider:
-            log.msg('Cannot find spider for url: %s' % response.url, level=log.ERROR)
-            return (), ()
-
         if callback:
             callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
             if not callback_fcn:
-                log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name))
+                log.msg('Cannot find callback %s in %s spider' % (callback, spider.name))
                 return (), ()

-            result = callback_fcn(response)
+            result = iterate_spider_output(callback_fcn(response))
             links = [i for i in result if isinstance(i, Request)]
             items = [self.pipeline_process(i, spider, opts) for i in result if \
                 isinstance(i, BaseItem)]
@@ -71,36 +72,68 @@ class Command(ScrapyCommand):
             display.pprint(list(links))

     def run(self, args, opts):
-        if not args:
-            print "An URL is required"
-            return
-
-        for response in fetch(args):
-            spider = spiders.fromurl(response.url)
-            if not spider:
-                log.msg('Cannot find spider for "%s"' % response.url)
-                continue
-
-            if self.callbacks:
-                for callback in self.callbacks:
-                    items, links = self.run_callback(spider, response, callback, args, opts)
-                    self.print_results(items, links, callback, opts)
-            elif opts.rules:
-                rules = getattr(spider, 'rules', None)
-                if rules:
-                    items, links = [], []
-                    for rule in rules:
-                        if rule.callback and rule.link_extractor.matches(response.url):
-                            items, links = self.run_callback(spider, response, rule.callback, args, opts)
-                            self.print_results(items, links, rule.callback, opts)
-                            break
-                else:
-                    log.msg('No rules found for spider "%s", please specify a callback for parsing' \
-                        % spider.domain_name)
-                    continue
-            else:
-                items, links = self.run_callback(spider, response, 'parse', args, opts)
-                self.print_results(items, links, 'parse', opts)
+        if not len(args) == 1 or not is_url(args[0]):
+            return False
+
+        request = Request(args[0])
+
+        if opts.spider:
+            try:
+                spider = spiders.create(opts.spider)
+            except KeyError:
+                log.msg('Could not find spider: %s' % opts.spider, log.ERROR)
+                return
+        else:
+            spider = scrapymanager._create_spider_for_request(request, \
+                log_none=True, log_multiple=True)
+
+        if not spider:
+            return
+
+        responses = [] # to collect downloaded responses
+        request = request.replace(callback=responses.append)
+
+        scrapymanager.crawl_request(request, spider)
+        scrapymanager.start()
+
+        if not responses:
+            log.msg('No response returned', log.ERROR, spider=spider)
+            return
+
+        # now process response
+        #   - if callbacks defined then call each one print results
+        #   - if --rules option given search for matching spider's rule
+        #   - default print result using default 'parse' spider's callback
+        response = responses[0]
+
+        if self.callbacks:
+            # apply each callback
+            for callback in self.callbacks:
+                items, links = self.run_callback(spider, response,
+                    callback, args, opts)
+                self.print_results(items, links, callback, opts)
+        elif opts.rules:
+            # search for matching spider's rule
+            if hasattr(spider, 'rules') and spider.rules:
+                items, links = [], []
+                for rule in spider.rules:
+                    if rule.link_extractor.matches(response.url) \
+                            and rule.callback:
+                        items, links = self.run_callback(spider,
+                            response, rule.callback, args, opts)
+                        self.print_results(items, links,
+                            rule.callback, opts)
+                        # first-match rule breaks rules loop
+                        break
+            else:
+                log.msg('No rules found for spider "%s", ' \
+                    'please specify a callback for parsing' \
+                    % spider.name, log.ERROR)
+        else:
+            # default callback 'parse'
+            items, links = self.run_callback(spider, response,
+                'parse', args, opts)
+            self.print_results(items, links, 'parse', opts)

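The callback result now goes through iterate_spider_output before being filtered into requests and items. A rough equivalent of what that helper has to provide (an assumption for illustration; the real implementation lives in scrapy.utils.spider):

    def iterate_spider_output_sketch(result):
        # wrap a single returned Request/item so callers can always iterate
        if result is None:
            return []
        if hasattr(result, '__iter__'):
            return result
        return [result]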
@@ -52,6 +52,10 @@ class Command(ScrapyCommand):
             dispatcher.connect(exporter.export_item, signal=signals.item_passed)
             exporter.start_exporting()
         module = _import_file(args[0])
-        scrapymanager.runonce(module.SPIDER)
+
+        # schedule spider and start engine
+        scrapymanager.crawl_spider(module.SPIDER)
+        scrapymanager.start()
+
         if opts.output:
             exporter.finish_exporting()
@@ -9,4 +9,4 @@ class Command(ScrapyCommand):
         return "Start the Scrapy manager but don't run any spider (idle mode)"

     def run(self, args, opts):
-        scrapymanager.start(*args)
+        scrapymanager.start(keep_alive=True)
@@ -7,7 +7,7 @@ from os.path import join, exists
 import scrapy
 from scrapy.command import ScrapyCommand
 from scrapy.utils.template import render_templatefile, string_camelcase
-from scrapy.utils.python import ignore_patterns, copytree
+from scrapy.utils.py26 import ignore_patterns, copytree

 TEMPLATES_PATH = join(scrapy.__path__[0], 'templates', 'project')

@@ -57,8 +57,6 @@ class ScrapyCommand(object):
             help="log level (default: %s)" % settings['LOGLEVEL'])
         group.add_option("--nolog", action="store_true", dest="nolog", \
             help="disable logging completely")
-        group.add_option("--spider", dest="spider", default=None, \
-            help="always use this spider when arguments are urls")
         group.add_option("--profile", dest="profile", metavar="FILE", default=None, \
             help="write python cProfile stats to FILE")
         group.add_option("--lsprof", dest="lsprof", metavar="FILE", default=None, \
@@ -99,10 +97,6 @@ class ScrapyCommand(object):
         if opts.nolog:
             settings.overrides['LOG_ENABLED'] = False

-        if opts.spider:
-            from scrapy.spider import spiders
-            spiders.force_domain = opts.spider
-
         if opts.pidfile:
             with open(opts.pidfile, "w") as f:
                 f.write(str(os.getpid()))
@@ -71,6 +71,40 @@ DOWNLOADER_STATS = True

 DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'

+ENCODING_ALIASES = {}
+
+ENCODING_ALIASES_BASE = {
+    # gb2312 is superseded by gb18030
+    'gb2312': 'gb18030',
+    'chinese': 'gb18030',
+    'csiso58gb231280': 'gb18030',
+    'euc-cn': 'gb18030',
+    'euccn': 'gb18030',
+    'eucgb2312-cn': 'gb18030',
+    'gb2312-1980': 'gb18030',
+    'gb2312-80': 'gb18030',
+    'iso-ir-58': 'gb18030',
+    # gbk is superseded by gb18030
+    'gbk': 'gb18030',
+    '936': 'gb18030',
+    'cp936': 'gb18030',
+    'ms936': 'gb18030',
+    # latin_1 is a subset of cp1252
+    'latin_1': 'cp1252',
+    'iso-8859-1': 'cp1252',
+    'iso8859-1': 'cp1252',
+    '8859': 'cp1252',
+    'cp819': 'cp1252',
+    'latin': 'cp1252',
+    'latin1': 'cp1252',
+    'l1': 'cp1252',
+    # others
+    'zh-cn': 'gb18030',
+    'win-1251': 'cp1251',
+    'macintosh' : 'mac_roman',
+    'x-sjis': 'shift_jis',
+}
+
 EXTENSIONS = {}

 EXTENSIONS_BASE = {
@@ -101,6 +135,7 @@ ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
 ITEM_PIPELINES = []

 LOG_ENABLED = True
+LOG_ENCODING = 'utf-8'
 LOG_FORMATTER_CRAWLED = 'scrapy.contrib.logformatter.crawled_logline'
 LOG_STDOUT = False
 LOG_LEVEL = 'DEBUG'
@@ -122,6 +157,8 @@ MYSQL_CONNECTION_SETTINGS = {}

 NEWSPIDER_MODULE = ''

+RANDOMIZE_DOWNLOAD_DELAY = True
+
 REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
@@ -150,7 +187,7 @@ SCHEDULER_MIDDLEWARES_BASE = {
     'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware': 500,
 }

-SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
+SCHEDULER_ORDER = 'DFO'

 SPIDER_MANAGER_CLASS = 'scrapy.contrib.spidermanager.TwistedPluginSpiderManager'

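ENCODING_ALIASES / ENCODING_ALIASES_BASE follow the same override pattern as the other paired *_BASE settings in this file: the base dict ships the defaults and the project-level dict overrides or extends them. A sketch of the intended resolution (assumed here for illustration):

    aliases = dict(ENCODING_ALIASES_BASE)
    aliases.update(ENCODING_ALIASES)        # project settings win
    aliases.get('latin1', 'latin1')         # -> 'cp1252'
    aliases.get('zh-cn', 'zh-cn')           # -> 'gb18030'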
@@ -5,13 +5,13 @@ because Amazon Web Service use timestamps for authentication.
 """

 import os
-import time
+from time import strftime, gmtime

-from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.aws import sign_request
 from scrapy.conf import settings


 class AWSMiddleware(object):

     def __init__(self):
         self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
             os.environ.get('AWS_ACCESS_KEY_ID')
@@ -19,9 +19,6 @@ class AWSMiddleware(object):
             os.environ.get('AWS_SECRET_ACCESS_KEY')

     def process_request(self, request, spider):
-        hostname = urlparse_cached(request).hostname
-        if spider.domain_name == 's3.amazonaws.com' \
-                or (hostname and hostname.endswith('s3.amazonaws.com')):
-            request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \
-                time.gmtime())
+        if request.meta.get('sign_s3_request'):
+            request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
             sign_request(request, self.access_key, self.secret_key)
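The middleware no longer guesses from the hostname or spider; a request is only signed when it is explicitly flagged, which is what the S3 images store does further below in this changeset. Building such a request looks like this (the bucket name is hypothetical):

    from scrapy.http import Request

    req = Request('http://mybucket.s3.amazonaws.com/some/key', method='HEAD',
                  meta={'sign_s3_request': True})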
@@ -108,7 +108,7 @@ class FilesystemCacheStorage(object):

     def _get_request_path(self, spider, request):
         key = request_fingerprint(request)
-        return join(self.cachedir, spider.domain_name, key[0:2], key)
+        return join(self.cachedir, spider.name, key[0:2], key)

     def _read_meta(self, spider, request):
         rpath = self._get_request_path(spider, request)
|
|||||||
from scrapy import log
|
from scrapy import log
|
||||||
|
from scrapy.http import HtmlResponse
|
||||||
from scrapy.utils.url import urljoin_rfc
|
from scrapy.utils.url import urljoin_rfc
|
||||||
from scrapy.utils.response import get_meta_refresh
|
from scrapy.utils.response import get_meta_refresh
|
||||||
from scrapy.core.exceptions import IgnoreRequest
|
from scrapy.core.exceptions import IgnoreRequest
|
||||||
@ -24,10 +25,11 @@ class RedirectMiddleware(object):
|
|||||||
redirected = request.replace(url=redirected_url)
|
redirected = request.replace(url=redirected_url)
|
||||||
return self._redirect(redirected, request, spider, response.status)
|
return self._redirect(redirected, request, spider, response.status)
|
||||||
|
|
||||||
interval, url = get_meta_refresh(response)
|
if isinstance(response, HtmlResponse):
|
||||||
if url and interval < self.max_metarefresh_delay:
|
interval, url = get_meta_refresh(response)
|
||||||
redirected = self._redirect_request_using_get(request, url)
|
if url and interval < self.max_metarefresh_delay:
|
||||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
redirected = self._redirect_request_using_get(request, url)
|
||||||
|
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
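The meta-refresh check only makes sense for HTML bodies, so it is now guarded by an isinstance test. The guarded call in isolation (a sketch; get_meta_refresh returns an (interval, url) pair, as used above):

    from scrapy.http import HtmlResponse
    from scrapy.utils.response import get_meta_refresh

    def meta_refresh_url(response, max_delay=100):
        if not isinstance(response, HtmlResponse):
            return None
        interval, url = get_meta_refresh(response)
        if url and interval < max_delay:
            return url
        return None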
@@ -1,9 +1,5 @@
 from scrapy.contrib.exporter import BaseItemExporter
-try:
-    import json
-except ImportError:
-    import simplejson as json
+from scrapy.utils.py26 import json

 class JsonLinesItemExporter(BaseItemExporter):

@@ -1,26 +0,0 @@
-"""
-Extensions to override scrapy settings with per-group settings according to the
-group the spider belongs to. It only overrides the settings when running the
-crawl command with *only one domain as argument*.
-"""
-
-from scrapy.conf import settings
-from scrapy.core.exceptions import NotConfigured
-from scrapy.command.cmdline import command_executed
-
-class GroupSettings(object):
-
-    def __init__(self):
-        if not settings.getbool("GROUPSETTINGS_ENABLED"):
-            raise NotConfigured
-
-        if command_executed and command_executed['name'] == 'crawl':
-            mod = __import__(settings['GROUPSETTINGS_MODULE'], {}, {}, [''])
-            args = command_executed['args']
-            if len(args) == 1 and not args[0].startswith('http://'):
-                domain = args[0]
-                settings.overrides.update(mod.default_settings)
-                for group, domains in mod.group_spiders.iteritems():
-                    if domain in domains:
-                        settings.overrides.update(mod.group_settings.get(group, {}))
@@ -1,6 +1,6 @@
 """
 This module provides a mechanism for collecting one (or more) sample items per
-domain.
+spider.

 The items are collected in a dict of guid->item and persisted by pickling that
 dict into a file.
@@ -8,7 +8,7 @@ dict into a file.
 This can be useful for testing changes made to the framework or other common
 code that affects several spiders.

-It uses the scrapy stats service to keep track of which domains are already
+It uses the scrapy stats service to keep track of which spiders are already
 sampled.

 Settings that affect this module:
@@ -48,7 +48,7 @@ class ItemSamplerPipeline(object):
             raise NotConfigured
         self.items = {}
         self.spiders_count = 0
-        self.empty_domains = set()
+        self.empty_spiders = set()
         dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
         dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

@@ -66,21 +66,21 @@ class ItemSamplerPipeline(object):
     def engine_stopped(self):
         with open(self.filename, 'w') as f:
             pickle.dump(self.items, f)
-        if self.empty_domains:
-            log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \
+        if self.empty_spiders:
+            log.msg("No products sampled for: %s" % " ".join(self.empty_spiders), \
                 level=log.WARNING)

     def spider_closed(self, spider, reason):
         if reason == 'finished' and not stats.get_value("items_sampled", spider=spider):
-            self.empty_domains.add(spider.domain_name)
+            self.empty_spiders.add(spider.name)
         self.spiders_count += 1
-        log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \
-            len(self.empty_domains)), level=log.INFO)
+        log.msg("Sampled %d spiders so far (%d empty)" % (self.spiders_count, \
+            len(self.empty_spiders)), level=log.INFO)


 class ItemSamplerMiddleware(object):
-    """This middleware drops items and requests (when domain sampling has been
-    completed) to accelerate the processing of remaining domains"""
+    """This middleware drops items and requests (when spider sampling has been
+    completed) to accelerate the processing of remaining spiders"""

     def __init__(self):
         if not settings['ITEMSAMPLER_FILE']:
@@ -26,7 +26,7 @@ class HtmlParserLinkExtractor(HTMLParser):
         links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

         ret = []
-        base_url = self.base_url if self.base_url else response_url
+        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
         for link in links:
             link.url = urljoin_rfc(base_url, link.url, response_encoding)
             link.url = safe_url_string(link.url, response_encoding)
@@ -3,7 +3,6 @@ This module implements the HtmlImageLinkExtractor for extracting
 image links only.
 """

-import urlparse

 from scrapy.link import Link
 from scrapy.utils.url import canonicalize_url, urljoin_rfc
@@ -25,13 +24,13 @@ class HTMLImageLinkExtractor(object):
         self.unique = unique
         self.canonicalize = canonicalize

-    def extract_from_selector(self, selector, parent=None):
+    def extract_from_selector(self, selector, encoding, parent=None):
         ret = []
         def _add_link(url_sel, alt_sel=None):
             url = flatten([url_sel.extract()])
             alt = flatten([alt_sel.extract()]) if alt_sel else (u'', )
             if url:
-                ret.append(Link(unicode_to_str(url[0]), alt[0]))
+                ret.append(Link(unicode_to_str(url[0], encoding), alt[0]))

         if selector.xmlNode.type == 'element':
             if selector.xmlNode.name == 'img':
@@ -41,7 +40,7 @@ class HTMLImageLinkExtractor(object):
                 children = selector.select('child::*')
                 if len(children):
                     for child in children:
-                        ret.extend(self.extract_from_selector(child, parent=selector))
+                        ret.extend(self.extract_from_selector(child, encoding, parent=selector))
                 elif selector.xmlNode.name == 'a' and not parent:
                     _add_link(selector.select('@href'), selector.select('@title'))
             else:
@@ -52,7 +51,7 @@ class HTMLImageLinkExtractor(object):
     def extract_links(self, response):
         xs = HtmlXPathSelector(response)
         base_url = xs.select('//base/@href').extract()
-        base_url = unicode_to_str(base_url[0]) if base_url else unicode_to_str(response.url)
+        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

         links = []
         for location in self.locations:
@@ -64,7 +63,7 @@ class HTMLImageLinkExtractor(object):
                 continue

             for selector in selectors:
-                links.extend(self.extract_from_selector(selector))
+                links.extend(self.extract_from_selector(selector, response.encoding))

         seen, ret = set(), []
         for link in links:
@@ -29,7 +29,7 @@ class LxmlLinkExtractor(object):
         links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

         ret = []
-        base_url = self.base_url if self.base_url else response_url
+        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
         for link in links:
             link.url = urljoin_rfc(base_url, link.url, response_encoding)
             link.url = safe_url_string(link.url, response_encoding)
@@ -16,8 +16,9 @@ def clean_link(link_text):

 class RegexLinkExtractor(SgmlLinkExtractor):
     """High performant link extractor"""

     def _extract_links(self, response_text, response_url, response_encoding):
-        base_url = self.base_url if self.base_url else response_url
+        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
+
         clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
         clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
@@ -28,7 +28,7 @@ class BaseSgmlLinkExtractor(FixedSGMLParser):
         links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

         ret = []
-        base_url = self.base_url if self.base_url else response_url
+        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
         for link in links:
             link.url = urljoin_rfc(base_url, link.url, response_encoding)
             link.url = safe_url_string(link.url, response_encoding)
@@ -8,6 +8,7 @@ from scrapy.xlib.pydispatch import dispatcher
 from scrapy.core import signals
 from scrapy.core.exceptions import NotConfigured
 from scrapy.contrib import exporter
+from scrapy.contrib.exporter import jsonlines
 from scrapy.conf import settings

 class FileExportPipeline(object):
@@ -48,7 +49,6 @@ class FileExportPipeline(object):
         elif format == 'pickle':
             exp = exporter.PickleItemExporter(file, **exp_kwargs)
         elif format == 'json':
-            from scrapy.contrib.exporter import jsonlines
             exp = jsonlines.JsonLinesItemExporter(file, **exp_kwargs)
         else:
             raise NotConfigured("Unsupported export format: %s" % format)
@@ -47,7 +47,7 @@ class FSImagesStore(object):
         dispatcher.connect(self.spider_closed, signals.spider_closed)

     def spider_closed(self, spider):
-        self.created_directories.pop(spider.domain_name, None)
+        self.created_directories.pop(spider.name, None)

     def persist_image(self, key, image, buf, info):
         absolute_path = self._get_filesystem_path(key)
@@ -92,7 +92,7 @@ class _S3AmazonAWSSpider(BaseSpider):
     It means that a spider that uses download_delay or alike is not going to be
     delayed even more because it is uploading images to s3.
     """
-    domain_name = "s3.amazonaws.com"
+    name = "s3.amazonaws.com"
     start_urls = ['http://s3.amazonaws.com/']
     max_concurrent_requests = 100

@@ -143,7 +143,7 @@ class S3ImagesStore(object):
     def _build_request(self, key, method, body=None, headers=None):
         url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
         return Request(url, method=method, body=body, headers=headers, \
-            priority=self.request_priority)
+            meta={'sign_s3_request': True}, priority=self.request_priority)

     def _download_request(self, request, info):
         """This method is used for HEAD and PUT requests sent to amazon S3
@@ -4,7 +4,6 @@ spiders
 """

 import sys
-import urlparse

 from twisted.plugin import getCache
 from twisted.python.rebuild import rebuild
@@ -19,42 +18,38 @@ class TwistedPluginSpiderManager(object):

     def __init__(self):
         self.loaded = False
-        self.force_domain = None
-        self._invaliddict = {}
         self._spiders = {}

-    def fromdomain(self, domain):
-        return self._spiders.get(domain)
+    def create(self, spider_name, **spider_kwargs):
+        """Returns a Spider instance for the given spider name, using the given
+        spider arguments. If the spider name is not found, it raises a
+        KeyError.
+        """
+        spider = self._spiders[spider_name]
+        spider.__dict__.update(spider_kwargs)
+        return spider

-    def fromurl(self, url):
-        if self.force_domain:
-            return self._spiders.get(self.force_domain)
-        domain = urlparse.urlparse(url).hostname
-        domain = str(domain).replace('www.', '')
-        if domain:
-            if domain in self._spiders: # try first locating by domain
-                return self._spiders[domain]
-            else: # else search spider by spider
-                plist = self._spiders.values()
-                for p in plist:
-                    if url_is_from_spider(url, p):
-                        return p
+    def find_by_request(self, request):
+        """Returns list of spiders names that match the given Request"""
+        return [name for name, spider in self._spiders.iteritems()
+            if url_is_from_spider(request.url, spider)]

     def list(self):
+        """Returns list of spiders available."""
         return self._spiders.keys()

     def load(self, spider_modules=None):
+        """Load spiders from module directory."""
         if spider_modules is None:
             spider_modules = settings.getlist('SPIDER_MODULES')
         self.spider_modules = spider_modules
-        self._invaliddict = {}
         self._spiders = {}

         modules = [__import__(m, {}, {}, ['']) for m in self.spider_modules]
         for module in modules:
             for spider in self._getspiders(ISpider, module):
                 ISpider.validateInvariants(spider)
-                self._spiders[spider.domain_name] = spider
+                self._spiders[spider.name] = spider
         self.loaded = True

     def _getspiders(self, interface, package):
@@ -77,14 +72,14 @@ class TwistedPluginSpiderManager(object):
         """Reload spider module to release any resources held on to by the
         spider
         """
-        domain = spider.domain_name
-        if domain not in self._spiders:
+        name = spider.name
+        if name not in self._spiders:
             return
-        spider = self._spiders[domain]
+        spider = self._spiders[name]
         module_name = spider.__module__
         module = sys.modules[module_name]
         if hasattr(module, 'SPIDER'):
             log.msg("Reloading module %s" % module_name, spider=spider, \
                 level=log.DEBUG)
             new_module = rebuild(module, doLog=0)
-            self._spiders[domain] = new_module.SPIDER
+            self._spiders[name] = new_module.SPIDER
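Typical use of the renamed lookup API, as the commands earlier in this changeset now do (the spider name and URL are hypothetical):

    from scrapy.spider import spiders
    from scrapy.http import Request

    spider = spiders.create('example.com')    # raises KeyError if no such spider
    names = spiders.find_by_request(Request('http://www.example.com/page'))
    # names is the list of spider names whose domains match the request URL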
@@ -47,8 +47,7 @@ class OffsiteMiddleware(object):
         return re.compile(regex)

     def spider_opened(self, spider):
-        domains = [spider.domain_name] + spider.extra_domain_names
-        self.host_regexes[spider] = self.get_host_regex(domains)
+        self.host_regexes[spider] = self.get_host_regex(spider.allowed_domains)
         self.domains_seen[spider] = set()

     def spider_closed(self, spider):
|
|||||||
"""
|
"""
|
||||||
rules = ()
|
rules = ()
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, *a, **kw):
|
||||||
"""Constructor takes care of compiling rules"""
|
"""Constructor takes care of compiling rules"""
|
||||||
super(CrawlSpider, self).__init__()
|
super(CrawlSpider, self).__init__(*a, **kw)
|
||||||
self._compile_rules()
|
self._compile_rules()
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
|
@ -3,8 +3,8 @@ from scrapy.spider import BaseSpider
|
|||||||
class InitSpider(BaseSpider):
|
class InitSpider(BaseSpider):
|
||||||
"""Base Spider with initialization facilities"""
|
"""Base Spider with initialization facilities"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, *a, **kw):
|
||||||
super(InitSpider, self).__init__()
|
super(InitSpider, self).__init__(*a, **kw)
|
||||||
self._postinit_reqs = []
|
self._postinit_reqs = []
|
||||||
self._init_complete = False
|
self._init_complete = False
|
||||||
self._init_started = False
|
self._init_started = False
|
||||||
|
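Forwarding *a and **kw up the chain lets subclasses define their own constructor arguments without breaking the base classes. A hypothetical subclass for illustration (the import path assumes the contrib spiders module of this era):

    from scrapy.contrib.spiders import CrawlSpider

    class MySpider(CrawlSpider):
        name = 'example.com'

        def __init__(self, category=None, *a, **kw):
            super(MySpider, self).__init__(*a, **kw)
            self.category = category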
@@ -23,6 +23,6 @@ class StatsMailer(object):
         mail = MailSender()
         body = "Global stats\n\n"
         body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items())
-        body += "\n\n%s stats\n\n" % spider.domain_name
+        body += "\n\n%s stats\n\n" % spider.name
         body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
-        mail.send(self.recipients, "Scrapy stats for: %s" % spider.domain_name, body)
+        mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
@@ -60,7 +60,7 @@ class LiveStats(object):
             runtime = datetime.now() - stats.started

             s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td></tr>\n' % \
-                (spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime))
+                (spider.name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime))

             totdomains += 1
             totscraped += stats.scraped
@@ -25,18 +25,18 @@ class Spiderctl(object):
         dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)

     def spider_opened(self, spider):
-        self.running[spider.domain_name] = spider
+        self.running[spider.name] = spider

     def spider_closed(self, spider):
-        del self.running[spider.domain_name]
-        self.finished.add(spider.domain_name)
+        del self.running[spider.name]
+        self.finished.add(spider.name)

     def webconsole_render(self, wc_request):
         if wc_request.args:
             changes = self.webconsole_control(wc_request)

-        self.scheduled = [s.domain_name for s in scrapyengine.spider_scheduler._pending_spiders]
-        self.idle = [d for d in self.enabled_domains if d not in self.scheduled
+        self.scheduled = [s.name for s in scrapyengine.spider_scheduler._pending_spiders]
+        self.idle = [d for d in self.enabled_spiders if d not in self.scheduled
                      and d not in self.running
                      and d not in self.finished]

@@ -53,9 +53,9 @@ class Spiderctl(object):
         # idle
         s += "<td valign='top'>\n"
         s += '<form method="post" action=".">\n'
-        s += '<select name="add_pending_domains" multiple="multiple">\n'
-        for domain in sorted(self.idle):
-            s += "<option>%s</option>\n" % domain
+        s += '<select name="add_pending_spiders" multiple="multiple">\n'
+        for name in sorted(self.idle):
+            s += "<option>%s</option>\n" % name
         s += '</select><br>\n'
         s += '<br />'
         s += '<input type="submit" value="Schedule selected">\n'
@@ -65,9 +65,9 @@ class Spiderctl(object):
         # scheduled
         s += "<td valign='top'>\n"
         s += '<form method="post" action=".">\n'
-        s += '<select name="remove_pending_domains" multiple="multiple">\n'
-        for domain in self.scheduled:
-            s += "<option>%s</option>\n" % domain
+        s += '<select name="remove_pending_spiders" multiple="multiple">\n'
+        for name in self.scheduled:
+            s += "<option>%s</option>\n" % name
         s += '</select><br>\n'
         s += '<br />'
         s += '<input type="submit" value="Remove selected">\n'
@@ -78,9 +78,9 @@ class Spiderctl(object):
         # running
         s += "<td valign='top'>\n"
         s += '<form method="post" action=".">\n'
-        s += '<select name="stop_running_domains" multiple="multiple">\n'
-        for domain in sorted(self.running):
-            s += "<option>%s</option>\n" % domain
+        s += '<select name="stop_running_spiders" multiple="multiple">\n'
+        for name in sorted(self.running):
+            s += "<option>%s</option>\n" % name
         s += '</select><br>\n'
         s += '<br />'
         s += '<input type="submit" value="Stop selected">\n'
@@ -90,9 +90,9 @@ class Spiderctl(object):
         # finished
         s += "<td valign='top'>\n"
         s += '<form method="post" action=".">\n'
-        s += '<select name="rerun_finished_domains" multiple="multiple">\n'
-        for domain in sorted(self.finished):
-            s += "<option>%s</option>\n" % domain
+        s += '<select name="rerun_finished_spiders" multiple="multiple">\n'
+        for name in sorted(self.finished):
+            s += "<option>%s</option>\n" % name
         s += '</select><br>\n'
         s += '<br />'
         s += '<input type="submit" value="Re-schedule selected">\n'
@@ -114,42 +114,42 @@ class Spiderctl(object):
         args = wc_request.args
         s = "<hr />\n"

-        if "stop_running_domains" in args:
+        if "stop_running_spiders" in args:
             s += "<p>"
-            stopped_domains = []
-            for domain in args["stop_running_domains"]:
-                if domain in self.running:
-                    scrapyengine.close_spider(self.running[domain])
-                    stopped_domains.append(domain)
-            s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains)
+            stopped_spiders = []
+            for name in args["stop_running_spiders"]:
+                if name in self.running:
+                    scrapyengine.close_spider(self.running[name])
+                    stopped_spiders.append(name)
+            s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_spiders)
             s += "</p>"
-        if "remove_pending_domains" in args:
+        if "remove_pending_spiders" in args:
             removed = []
-            for domain in args["remove_pending_domains"]:
-                if scrapyengine.spider_scheduler.remove_pending_domain(domain):
-                    removed.append(domain)
+            for name in args["remove_pending_spiders"]:
+                if scrapyengine.spider_scheduler.remove_pending_spider(name):
+                    removed.append(name)
             if removed:
                 s += "<p>"
-                s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_domains"])
+                s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_spiders"])
                 s += "</p>"
-        if "add_pending_domains" in args:
-            for domain in args["add_pending_domains"]:
-                if domain not in scrapyengine.scheduler.pending_requests:
-                    scrapymanager.crawl(domain)
+        if "add_pending_spiders" in args:
+            for name in args["add_pending_spiders"]:
+                if name not in scrapyengine.scheduler.pending_requests:
+                    scrapymanager.crawl_spider_name(name)
             s += "<p>"
-            s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"])
+            s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_spiders"])
             s += "</p>"
-        if "rerun_finished_domains" in args:
-            for domain in args["rerun_finished_domains"]:
-                if domain not in scrapyengine.scheduler.pending_requests:
-                    scrapymanager.crawl(domain)
-                self.finished.remove(domain)
+        if "rerun_finished_spiders" in args:
+            for name in args["rerun_finished_spiders"]:
+                if name not in scrapyengine.scheduler.pending_requests:
+                    scrapymanager.crawl_spider_name(name)
+                self.finished.remove(name)
             s += "<p>"
-            s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"])
+            s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_spiders"])
             s += "</p>"

         return s

     def webconsole_discover_module(self):
-        self.enabled_domains = spiders.list()
+        self.enabled_spiders = spiders.list()
         return self
@ -23,7 +23,7 @@ class StatsDump(object):
|
|||||||
s += "<h3>Global stats</h3>\n"
|
s += "<h3>Global stats</h3>\n"
|
||||||
s += stats_html_table(stats.get_stats())
|
s += stats_html_table(stats.get_stats())
|
||||||
for spider, spider_stats in stats.iter_spider_stats():
|
for spider, spider_stats in stats.iter_spider_stats():
|
||||||
s += "<h3>%s</h3>\n" % spider.domain_name
|
s += "<h3>%s</h3>\n" % spider.name
|
||||||
s += stats_html_table(spider_stats)
|
s += stats_html_table(spider_stats)
|
||||||
s += "</body>\n"
|
s += "</body>\n"
|
||||||
s += "</html>\n"
|
s += "</html>\n"
|
||||||
|
4
scrapy/contrib_exp/crawlspider/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
"""CrawlSpider v2"""
|
||||||
|
|
||||||
|
from .rules import Rule
|
||||||
|
from .spider import CrawlSpider
|
61
scrapy/contrib_exp/crawlspider/matchers.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
"""
|
||||||
|
Request/Response Matchers
|
||||||
|
|
||||||
|
Evaluate Request or Response attributes
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
class BaseMatcher(object):
|
||||||
|
"""Base matcher. Returns True by default."""
|
||||||
|
|
||||||
|
def matches_request(self, request):
|
||||||
|
"""Performs Request Matching"""
|
||||||
|
return True
|
||||||
|
|
||||||
|
def matches_response(self, response):
|
||||||
|
"""Performs Response Matching"""
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class UrlMatcher(BaseMatcher):
|
||||||
|
"""Matches URL attribute"""
|
||||||
|
|
||||||
|
def __init__(self, url):
|
||||||
|
"""Initialize url attribute"""
|
||||||
|
self._url = url
|
||||||
|
|
||||||
|
def matches_url(self, url):
|
||||||
|
"""Returns True if given url is equal to matcher's url"""
|
||||||
|
return self._url == url
|
||||||
|
|
||||||
|
def matches_request(self, request):
|
||||||
|
"""Returns True if Request's url matches initial url"""
|
||||||
|
return self.matches_url(request.url)
|
||||||
|
|
||||||
|
def matches_response(self, response):
|
||||||
|
"""Returns True if Response's url matches initial url"""
|
||||||
|
return self.matches_url(response.url)
|
||||||
|
|
||||||
|
|
||||||
|
class UrlRegexMatcher(UrlMatcher):
|
||||||
|
"""Matches URL using regular expression"""
|
||||||
|
|
||||||
|
def __init__(self, regex, flags=0):
|
||||||
|
"""Initialize regular expression"""
|
||||||
|
self._regex = re.compile(regex, flags)
|
||||||
|
|
||||||
|
def matches_url(self, url):
|
||||||
|
"""Returns True if url matches regular expression"""
|
||||||
|
return self._regex.search(url) is not None
|
||||||
|
|
||||||
|
|
||||||
|
class UrlListMatcher(UrlMatcher):
|
||||||
|
"""Matches if URL is in List"""
|
||||||
|
|
||||||
|
def __init__(self, urls):
|
||||||
|
self._urls = urls
|
||||||
|
|
||||||
|
def matches_url(self, url):
|
||||||
|
"""Returns True if url is in urls list"""
|
||||||
|
return url in self._urls
|
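The matchers above are plain predicate objects, so they can be exercised on their own. A minimal sketch, assuming the module is importable as scrapy.contrib_exp.crawlspider.matchers as added in this patch (the example.com urls are invented):

from scrapy.http import Request
from scrapy.contrib_exp.crawlspider.matchers import (BaseMatcher,
    UrlRegexMatcher, UrlListMatcher)

req = Request('http://example.com/product/123')

# BaseMatcher accepts anything
assert BaseMatcher().matches_request(req)

# UrlRegexMatcher tests the url with re.search
assert UrlRegexMatcher(r'/product/\d+').matches_request(req)
assert not UrlRegexMatcher(r'/category/').matches_request(req)

# UrlListMatcher is an exact membership test, so the url must match verbatim
assert UrlListMatcher(['http://example.com/product/123']).matches_request(req)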
117
scrapy/contrib_exp/crawlspider/reqext.py
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
"""Request Extractors"""
|
||||||
|
from scrapy.http import Request
|
||||||
|
from scrapy.selector import HtmlXPathSelector
|
||||||
|
from scrapy.utils.misc import arg_to_iter
|
||||||
|
from scrapy.utils.python import FixedSGMLParser, str_to_unicode
|
||||||
|
from scrapy.utils.url import safe_url_string, urljoin_rfc
|
||||||
|
|
||||||
|
from itertools import ifilter
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSgmlRequestExtractor(FixedSGMLParser):
|
||||||
|
"""Base SGML Request Extractor"""
|
||||||
|
|
||||||
|
def __init__(self, tag='a', attr='href'):
|
||||||
|
"""Initialize attributes"""
|
||||||
|
FixedSGMLParser.__init__(self)
|
||||||
|
|
||||||
|
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||||
|
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||||
|
self.current_request = None
|
||||||
|
|
||||||
|
def extract_requests(self, response):
|
||||||
|
"""Returns list of requests extracted from response"""
|
||||||
|
return self._extract_requests(response.body, response.url,
|
||||||
|
response.encoding)
|
||||||
|
|
||||||
|
def _extract_requests(self, response_text, response_url, response_encoding):
|
||||||
|
"""Extract requests with absolute urls"""
|
||||||
|
self.reset()
|
||||||
|
self.feed(response_text)
|
||||||
|
self.close()
|
||||||
|
|
||||||
|
base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
|
||||||
|
self._make_absolute_urls(base_url, response_encoding)
|
||||||
|
self._fix_link_text_encoding(response_encoding)
|
||||||
|
|
||||||
|
return self.requests
|
||||||
|
|
||||||
|
def _make_absolute_urls(self, base_url, encoding):
|
||||||
|
"""Makes all request's urls absolute"""
|
||||||
|
for req in self.requests:
|
||||||
|
url = req.url
|
||||||
|
# make absolute url
|
||||||
|
url = urljoin_rfc(base_url, url, encoding)
|
||||||
|
url = safe_url_string(url, encoding)
|
||||||
|
# replace in-place request's url
|
||||||
|
req.url = url
|
||||||
|
|
||||||
|
def _fix_link_text_encoding(self, encoding):
|
||||||
|
"""Convert link_text to unicode for each request"""
|
||||||
|
for req in self.requests:
|
||||||
|
req.meta.setdefault('link_text', '')
|
||||||
|
req.meta['link_text'] = str_to_unicode(req.meta['link_text'],
|
||||||
|
encoding)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Reset state"""
|
||||||
|
FixedSGMLParser.reset(self)
|
||||||
|
self.requests = []
|
||||||
|
self.base_url = None
|
||||||
|
|
||||||
|
def unknown_starttag(self, tag, attrs):
|
||||||
|
"""Process unknown start tag"""
|
||||||
|
if 'base' == tag:
|
||||||
|
self.base_url = dict(attrs).get('href')
|
||||||
|
|
||||||
|
_matches = lambda (attr, value): self.scan_attr(attr) \
|
||||||
|
and value is not None
|
||||||
|
if self.scan_tag(tag):
|
||||||
|
for attr, value in ifilter(_matches, attrs):
|
||||||
|
req = Request(url=value)
|
||||||
|
self.requests.append(req)
|
||||||
|
self.current_request = req
|
||||||
|
|
||||||
|
def unknown_endtag(self, tag):
|
||||||
|
"""Process unknown end tag"""
|
||||||
|
self.current_request = None
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
"""Process data"""
|
||||||
|
current = self.current_request
|
||||||
|
if current and not 'link_text' in current.meta:
|
||||||
|
current.meta['link_text'] = data.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class SgmlRequestExtractor(BaseSgmlRequestExtractor):
|
||||||
|
"""SGML Request Extractor"""
|
||||||
|
|
||||||
|
def __init__(self, tags=None, attrs=None):
|
||||||
|
"""Initialize with custom tag & attribute function checkers"""
|
||||||
|
# defaults
|
||||||
|
tags = tuple(tags) if tags else ('a', 'area')
|
||||||
|
attrs = tuple(attrs) if attrs else ('href', )
|
||||||
|
|
||||||
|
tag_func = lambda x: x in tags
|
||||||
|
attr_func = lambda x: x in attrs
|
||||||
|
BaseSgmlRequestExtractor.__init__(self, tag=tag_func, attr=attr_func)
|
||||||
|
|
||||||
|
# TODO: move to own file
|
||||||
|
class XPathRequestExtractor(SgmlRequestExtractor):
|
||||||
|
"""SGML Request Extractor with XPath restriction"""
|
||||||
|
|
||||||
|
def __init__(self, restrict_xpaths, tags=None, attrs=None):
|
||||||
|
"""Initialize XPath restrictions"""
|
||||||
|
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
|
||||||
|
SgmlRequestExtractor.__init__(self, tags, attrs)
|
||||||
|
|
||||||
|
def extract_requests(self, response):
|
||||||
|
"""Restrict to XPath regions"""
|
||||||
|
hxs = HtmlXPathSelector(response)
|
||||||
|
fragments = (''.join(
|
||||||
|
html_frag for html_frag in hxs.select(xpath).extract()
|
||||||
|
) for xpath in self.restrict_xpaths)
|
||||||
|
html_slice = ''.join(html_frag for html_frag in fragments)
|
||||||
|
return self._extract_requests(html_slice, response.url,
|
||||||
|
response.encoding)
|
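A rough illustration of driving the extractors above by hand; the HTML body and urls are invented, and it assumes HtmlResponse accepts body and encoding keyword arguments as elsewhere in this changeset:

from scrapy.http import HtmlResponse
from scrapy.contrib_exp.crawlspider.reqext import (SgmlRequestExtractor,
    XPathRequestExtractor)

body = """<html><body>
<div id="nav"><a href="/about">About us</a></div>
<div id="content"><a href="/item/1">First item</a></div>
</body></html>"""
response = HtmlResponse('http://example.com/', body=body, encoding='utf-8')

# extract every <a>/<area> link as a Request with an absolute url
requests = SgmlRequestExtractor().extract_requests(response)
print [r.url for r in requests]                  # both links, made absolute
print [r.meta['link_text'] for r in requests]    # the link texts, as unicode

# restrict extraction to one region of the page
content_only = XPathRequestExtractor('//div[@id="content"]')
print [r.url for r in content_only.extract_requests(response)]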
||||||
|
|
27
scrapy/contrib_exp/crawlspider/reqgen.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
"""Request Generator"""
|
||||||
|
from itertools import imap
|
||||||
|
|
||||||
|
class RequestGenerator(object):
|
||||||
|
"""Extracto and process requests from response"""
|
||||||
|
|
||||||
|
def __init__(self, req_extractors, req_processors, callback, spider=None):
|
||||||
|
"""Initialize attributes"""
|
||||||
|
self._request_extractors = req_extractors
|
||||||
|
self._request_processors = req_processors
|
||||||
|
#TODO: resolve callback?
|
||||||
|
self._callback = callback
|
||||||
|
|
||||||
|
def generate_requests(self, response):
|
||||||
|
"""Extract and process new requests from response.
|
||||||
|
Attach callback to each request as default callback."""
|
||||||
|
requests = []
|
||||||
|
for ext in self._request_extractors:
|
||||||
|
requests.extend(ext.extract_requests(response))
|
||||||
|
|
||||||
|
for proc in self._request_processors:
|
||||||
|
requests = proc(requests)
|
||||||
|
|
||||||
|
# return iterator
|
||||||
|
# @@@ creates new Request object with callback
|
||||||
|
return imap(lambda r: r.replace(callback=self._callback), requests)
|
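RequestGenerator just chains the pieces: every extractor contributes requests, every processor filters or rewrites the combined list, and each surviving request gets the given callback attached. A hedged sketch (the callback and the response variable are placeholders):

from scrapy.contrib_exp.crawlspider.reqext import SgmlRequestExtractor
from scrapy.contrib_exp.crawlspider.reqproc import Canonicalize, FilterDupes
from scrapy.contrib_exp.crawlspider.reqgen import RequestGenerator

def parse_page(response):
    pass   # placeholder callback

gen = RequestGenerator(req_extractors=[SgmlRequestExtractor()],
                       req_processors=[Canonicalize(), FilterDupes()],
                       callback=parse_page)

# given some HtmlResponse `response`, this yields Requests whose callback
# is already set to parse_page:
#     for req in gen.generate_requests(response):
#         ...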
||||||
|
|
111
scrapy/contrib_exp/crawlspider/reqproc.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
"""Request Processors"""
|
||||||
|
from scrapy.utils.misc import arg_to_iter
|
||||||
|
from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
|
||||||
|
|
||||||
|
from itertools import ifilter, imap
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
class Canonicalize(object):
|
||||||
|
"""Canonicalize Request Processor"""
|
||||||
|
def _replace_url(self, req):
|
||||||
|
# replace in-place
|
||||||
|
req.url = canonicalize_url(req.url)
|
||||||
|
return req
|
||||||
|
|
||||||
|
def __call__(self, requests):
|
||||||
|
"""Canonicalize all requests' urls"""
|
||||||
|
return imap(self._replace_url, requests)
|
||||||
|
|
||||||
|
|
||||||
|
class FilterDupes(object):
|
||||||
|
"""Filter duplicate Requests"""
|
||||||
|
|
||||||
|
def __init__(self, *attributes):
|
||||||
|
"""Initialize comparison attributes"""
|
||||||
|
self._attributes = tuple(attributes) if attributes \
|
||||||
|
else tuple(['url'])
|
||||||
|
|
||||||
|
def _equal_attr(self, obj1, obj2, attr):
|
||||||
|
return getattr(obj1, attr) == getattr(obj2, attr)
|
||||||
|
|
||||||
|
def _requests_equal(self, req1, req2):
|
||||||
|
"""Attribute comparison helper"""
|
||||||
|
# look for not equal attribute
|
||||||
|
_not_equal = lambda attr: not self._equal_attr(req1, req2, attr)
|
||||||
|
for attr in ifilter(_not_equal, self._attributes):
|
||||||
|
return False
|
||||||
|
# all attributes equal
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _request_in(self, request, requests_seen):
|
||||||
|
"""Check if request is in given requests seen list"""
|
||||||
|
_req_seen = lambda r: self._requests_equal(r, request)
|
||||||
|
for seen in ifilter(_req_seen, requests_seen):
|
||||||
|
return True
|
||||||
|
# request not seen
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __call__(self, requests):
|
||||||
|
"""Filter seen requests"""
|
||||||
|
# per-call duplicates filter
|
||||||
|
self.requests_seen = set()
|
||||||
|
_not_seen = lambda r: not self._request_in(r, self.requests_seen)
|
||||||
|
for req in ifilter(_not_seen, requests):
|
||||||
|
yield req
|
||||||
|
# register seen request
|
||||||
|
self.requests_seen.add(req)
|
||||||
|
|
||||||
|
|
||||||
|
class FilterDomain(object):
|
||||||
|
"""Filter request's domain"""
|
||||||
|
|
||||||
|
def __init__(self, allow=(), deny=()):
|
||||||
|
"""Initialize allow/deny attributes"""
|
||||||
|
self.allow = tuple(arg_to_iter(allow))
|
||||||
|
self.deny = tuple(arg_to_iter(deny))
|
||||||
|
|
||||||
|
def __call__(self, requests):
|
||||||
|
"""Filter domains"""
|
||||||
|
processed = (req for req in requests)
|
||||||
|
|
||||||
|
if self.allow:
|
||||||
|
processed = (req for req in requests
|
||||||
|
if url_is_from_any_domain(req.url, self.allow))
|
||||||
|
if self.deny:
|
||||||
|
processed = (req for req in requests
|
||||||
|
if not url_is_from_any_domain(req.url, self.deny))
|
||||||
|
|
||||||
|
return processed
|
||||||
|
|
||||||
|
|
||||||
|
class FilterUrl(object):
|
||||||
|
"""Filter request's url"""
|
||||||
|
|
||||||
|
def __init__(self, allow=(), deny=()):
|
||||||
|
"""Initialize allow/deny attributes"""
|
||||||
|
_re_type = type(re.compile('', 0))
|
||||||
|
|
||||||
|
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||||
|
for x in arg_to_iter(allow)]
|
||||||
|
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
|
||||||
|
for x in arg_to_iter(deny)]
|
||||||
|
|
||||||
|
def __call__(self, requests):
|
||||||
|
"""Filter request's url based on allow/deny rules"""
|
||||||
|
#TODO: filter valid urls here?
|
||||||
|
processed = (req for req in requests)
|
||||||
|
|
||||||
|
if self.allow_res:
|
||||||
|
processed = (req for req in requests
|
||||||
|
if self._matches(req.url, self.allow_res))
|
||||||
|
if self.deny_res:
|
||||||
|
processed = (req for req in requests
|
||||||
|
if not self._matches(req.url, self.deny_res))
|
||||||
|
|
||||||
|
return processed
|
||||||
|
|
||||||
|
def _matches(self, url, regexs):
|
||||||
|
"""Returns True if url matches any regex in given list"""
|
||||||
|
return any(r.search(url) for r in regexs)
|
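Each processor is a callable that takes an iterable of Requests and returns another one, so they compose by simple chaining, which is exactly what RequestGenerator does with them. A small hedged example (urls invented; it assumes canonicalize_url sorts query parameters, as it does elsewhere in Scrapy):

from scrapy.http import Request
from scrapy.contrib_exp.crawlspider.reqproc import (Canonicalize,
    FilterDupes, FilterDomain)

requests = [
    Request('http://example.com/item?id=2&ref=home'),
    Request('http://example.com/item?ref=home&id=2'),   # same url, other order
    Request('http://other.org/page'),
]

# chain the processors the same way RequestGenerator does
processed = requests
for proc in [Canonicalize(), FilterDupes(), FilterDomain(allow=('example.com',))]:
    processed = proc(processed)

# only one canonicalized example.com request should survive
print [r.url for r in processed]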
||||||
|
|
100
scrapy/contrib_exp/crawlspider/rules.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
"""Crawler Rules"""
|
||||||
|
from scrapy.http import Request
|
||||||
|
from scrapy.http import Response
|
||||||
|
|
||||||
|
from functools import partial
|
||||||
|
from itertools import ifilter
|
||||||
|
|
||||||
|
from .matchers import BaseMatcher
|
||||||
|
# default string-to-matcher class
|
||||||
|
from .matchers import UrlRegexMatcher
|
||||||
|
|
||||||
|
class CompiledRule(object):
|
||||||
|
"""Compiled version of Rule"""
|
||||||
|
def __init__(self, matcher, callback=None, follow=False):
|
||||||
|
"""Initialize attributes checking type"""
|
||||||
|
assert isinstance(matcher, BaseMatcher)
|
||||||
|
assert callback is None or callable(callback)
|
||||||
|
assert isinstance(follow, bool)
|
||||||
|
|
||||||
|
self.matcher = matcher
|
||||||
|
self.callback = callback
|
||||||
|
self.follow = follow
|
||||||
|
|
||||||
|
|
||||||
|
class Rule(object):
|
||||||
|
"""Crawler Rule"""
|
||||||
|
def __init__(self, matcher=None, callback=None, follow=False, **kwargs):
|
||||||
|
"""Store attributes"""
|
||||||
|
self.matcher = matcher
|
||||||
|
self.callback = callback
|
||||||
|
self.cb_kwargs = kwargs if kwargs else {}
|
||||||
|
self.follow = True if follow else False
|
||||||
|
|
||||||
|
if self.callback is None and self.follow is False:
|
||||||
|
raise ValueError("Rule must either have a callback or "
|
||||||
|
"follow=True: %r" % self)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "Rule(matcher=%r, callback=%r, follow=%r, **%r)" \
|
||||||
|
% (self.matcher, self.callback, self.follow, self.cb_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class RulesManager(object):
|
||||||
|
"""Rules Manager"""
|
||||||
|
def __init__(self, rules, spider, default_matcher=UrlRegexMatcher):
|
||||||
|
"""Initialize rules using spider and default matcher"""
|
||||||
|
self._rules = tuple()
|
||||||
|
|
||||||
|
# compile absolute/relative-to-spider callbacks
|
||||||
|
for rule in rules:
|
||||||
|
# prepare matcher
|
||||||
|
if rule.matcher is None:
|
||||||
|
# instantiate BaseMatcher by default
|
||||||
|
matcher = BaseMatcher()
|
||||||
|
elif isinstance(rule.matcher, BaseMatcher):
|
||||||
|
matcher = rule.matcher
|
||||||
|
else:
|
||||||
|
# matcher not BaseMatcher, check for string
|
||||||
|
if isinstance(rule.matcher, basestring):
|
||||||
|
# instantiate default matcher
|
||||||
|
matcher = default_matcher(rule.matcher)
|
||||||
|
else:
|
||||||
|
raise ValueError('Not valid matcher given %r in %r' \
|
||||||
|
% (rule.matcher, rule))
|
||||||
|
|
||||||
|
# prepare callback
|
||||||
|
if callable(rule.callback):
|
||||||
|
callback = rule.callback
|
||||||
|
elif not rule.callback is None:
|
||||||
|
# callback from spider
|
||||||
|
callback = getattr(spider, rule.callback)
|
||||||
|
|
||||||
|
if not callable(callback):
|
||||||
|
raise AttributeError('Invalid callback %r can not be resolved' \
|
||||||
|
% callback)
|
||||||
|
else:
|
||||||
|
callback = None
|
||||||
|
|
||||||
|
if rule.cb_kwargs:
|
||||||
|
# build partial callback
|
||||||
|
callback = partial(callback, **rule.cb_kwargs)
|
||||||
|
|
||||||
|
# append compiled rule to rules list
|
||||||
|
crule = CompiledRule(matcher, callback, follow=rule.follow)
|
||||||
|
self._rules += (crule, )
|
||||||
|
|
||||||
|
def get_rule_from_request(self, request):
|
||||||
|
"""Returns first rule that matches given Request"""
|
||||||
|
_matches = lambda r: r.matcher.matches_request(request)
|
||||||
|
for rule in ifilter(_matches, self._rules):
|
||||||
|
# return first match of iterator
|
||||||
|
return rule
|
||||||
|
|
||||||
|
def get_rule_from_response(self, response):
|
||||||
|
"""Returns first rule that matches given Response"""
|
||||||
|
_matches = lambda r: r.matcher.matches_response(response)
|
||||||
|
for rule in ifilter(_matches, self._rules):
|
||||||
|
# return first match of iterator
|
||||||
|
return rule
|
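A hedged sketch of how Rule and RulesManager fit together: a matcher given as a plain string is wrapped in the default UrlRegexMatcher, a string callback is looked up on the spider, and the manager then answers which rule applies to a given request or response. The spider class and urls are invented:

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.contrib_exp.crawlspider.rules import Rule, RulesManager

class MySpider(BaseSpider):
    name = 'example.com'

    def parse_item(self, response):
        pass

spider = MySpider()
rules = [
    Rule(r'/item/\d+', callback='parse_item'),   # string -> UrlRegexMatcher
    Rule(r'/category/', follow=True),            # follow-only rule
]
manager = RulesManager(rules, spider)

rule = manager.get_rule_from_request(Request('http://example.com/item/7'))
print rule.callback    # the bound MySpider.parse_item method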
||||||
|
|
69
scrapy/contrib_exp/crawlspider/spider.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
"""CrawlSpider v2"""
|
||||||
|
from scrapy.spider import BaseSpider
|
||||||
|
from scrapy.utils.spider import iterate_spider_output
from scrapy import log  # needed for the log.WARNING level used in parse()
|
||||||
|
|
||||||
|
from .matchers import UrlListMatcher
|
||||||
|
from .rules import Rule, RulesManager
|
||||||
|
from .reqext import SgmlRequestExtractor
|
||||||
|
from .reqgen import RequestGenerator
|
||||||
|
from .reqproc import Canonicalize, FilterDupes
|
||||||
|
|
||||||
|
class CrawlSpider(BaseSpider):
|
||||||
|
"""CrawlSpider v2"""
|
||||||
|
|
||||||
|
request_extractors = None
|
||||||
|
request_processors = None
|
||||||
|
rules = []
|
||||||
|
|
||||||
|
def __init__(self, *a, **kw):
|
||||||
|
"""Initialize dispatcher"""
|
||||||
|
super(CrawlSpider, self).__init__(*a, **kw)
|
||||||
|
|
||||||
|
# auto follow start urls
|
||||||
|
if self.start_urls:
|
||||||
|
_matcher = UrlListMatcher(self.start_urls)
|
||||||
|
# append new rule using type from current self.rules
|
||||||
|
rules = self.rules + type(self.rules)([
|
||||||
|
Rule(_matcher, follow=True)
|
||||||
|
])
|
||||||
|
else:
|
||||||
|
rules = self.rules
|
||||||
|
|
||||||
|
# set defaults if not set
|
||||||
|
if self.request_extractors is None:
|
||||||
|
# default link extractor. Extracts all links from response
|
||||||
|
self.request_extractors = [ SgmlRequestExtractor() ]
|
||||||
|
|
||||||
|
if self.request_processors is None:
|
||||||
|
# default processor. Filter duplicate requests
|
||||||
|
self.request_processors = [ FilterDupes() ]
|
||||||
|
|
||||||
|
|
||||||
|
# wrap rules
|
||||||
|
self._rulesman = RulesManager(rules, spider=self)
|
||||||
|
# generates new requests with given callback
|
||||||
|
self._reqgen = RequestGenerator(self.request_extractors,
|
||||||
|
self.request_processors,
|
||||||
|
callback=self.parse)
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
"""Dispatch callback and generate requests"""
|
||||||
|
# get rule for response
|
||||||
|
rule = self._rulesman.get_rule_from_response(response)
|
||||||
|
|
||||||
|
if rule:
|
||||||
|
# dispatch callback if set
|
||||||
|
if rule.callback:
|
||||||
|
output = iterate_spider_output(rule.callback(response))
|
||||||
|
for req_or_item in output:
|
||||||
|
yield req_or_item
|
||||||
|
|
||||||
|
if rule.follow:
|
||||||
|
for req in self._reqgen.generate_requests(response):
|
||||||
|
# only dispatch request if has matching rule
|
||||||
|
if self._rulesman.get_rule_from_request(req):
|
||||||
|
yield req
|
||||||
|
else:
|
||||||
|
self.log("No rule for response %s" % response, level=log.WARNING)
|
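Putting the pieces together, a spider built on this class only declares its rules (and optionally its own extractors and processors); everything else is wired up in __init__ above. A hedged sketch of a hypothetical subclass (site, patterns and callback body are invented):

from scrapy.contrib_exp.crawlspider import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = [
        # follow category listings, no callback needed
        Rule(r'/category/', follow=True),
        # scrape item pages and keep following links found on them
        Rule(r'/item/\d+', callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        self.log("scraping %s" % response.url)
        # return or yield items and extra Requests here, as in any callback
        return []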
||||||
|
|
||||||
|
|
@ -1,55 +0,0 @@
|
|||||||
"""
|
|
||||||
A pipeline to persist objects using shove.
|
|
||||||
|
|
||||||
Shove is a "new generation" shelve. For more information see:
|
|
||||||
http://pypi.python.org/pypi/shove
|
|
||||||
"""
|
|
||||||
|
|
||||||
from string import Template
|
|
||||||
|
|
||||||
from shove import Shove
|
|
||||||
from scrapy.xlib.pydispatch import dispatcher
|
|
||||||
|
|
||||||
from scrapy import log
|
|
||||||
from scrapy.core import signals
|
|
||||||
from scrapy.conf import settings
|
|
||||||
from scrapy.core.exceptions import NotConfigured
|
|
||||||
|
|
||||||
class ShoveItemPipeline(object):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.uritpl = settings['SHOVEITEM_STORE_URI']
|
|
||||||
if not self.uritpl:
|
|
||||||
raise NotConfigured
|
|
||||||
self.opts = settings['SHOVEITEM_STORE_OPT'] or {}
|
|
||||||
self.stores = {}
|
|
||||||
|
|
||||||
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
|
|
||||||
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
|
|
||||||
|
|
||||||
def process_item(self, spider, item):
|
|
||||||
guid = str(item.guid)
|
|
||||||
|
|
||||||
if guid in self.stores[spider]:
|
|
||||||
if self.stores[spider][guid] == item:
|
|
||||||
status = 'old'
|
|
||||||
else:
|
|
||||||
status = 'upd'
|
|
||||||
else:
|
|
||||||
status = 'new'
|
|
||||||
|
|
||||||
if not status == 'old':
|
|
||||||
self.stores[spider][guid] = item
|
|
||||||
self.log(spider, item, status)
|
|
||||||
return item
|
|
||||||
|
|
||||||
def spider_opened(self, spider):
|
|
||||||
uri = Template(self.uritpl).substitute(domain=spider.domain_name)
|
|
||||||
self.stores[spider] = Shove(uri, **self.opts)
|
|
||||||
|
|
||||||
def spider_closed(self, spider):
|
|
||||||
self.stores[spider].sync()
|
|
||||||
|
|
||||||
def log(self, spider, item, status):
|
|
||||||
log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, \
|
|
||||||
spider=spider)
|
|
@ -2,6 +2,7 @@
|
|||||||
Download web pages using asynchronous IO
|
Download web pages using asynchronous IO
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import random
|
||||||
from time import time
|
from time import time
|
||||||
|
|
||||||
from twisted.internet import reactor, defer
|
from twisted.internet import reactor, defer
|
||||||
@ -20,15 +21,21 @@ class SpiderInfo(object):
|
|||||||
|
|
||||||
def __init__(self, download_delay=None, max_concurrent_requests=None):
|
def __init__(self, download_delay=None, max_concurrent_requests=None):
|
||||||
if download_delay is None:
|
if download_delay is None:
|
||||||
self.download_delay = settings.getfloat('DOWNLOAD_DELAY')
|
self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||||
else:
|
else:
|
||||||
self.download_delay = download_delay
|
self._download_delay = float(download_delay)
|
||||||
if self.download_delay:
|
if self._download_delay:
|
||||||
self.max_concurrent_requests = 1
|
self.max_concurrent_requests = 1
|
||||||
elif max_concurrent_requests is None:
|
elif max_concurrent_requests is None:
|
||||||
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
|
self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
|
||||||
else:
|
else:
|
||||||
self.max_concurrent_requests = max_concurrent_requests
|
self.max_concurrent_requests = max_concurrent_requests
|
||||||
|
if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
|
||||||
|
# same policy as wget --random-wait
|
||||||
|
self.random_delay_interval = (0.5*self._download_delay, \
|
||||||
|
1.5*self._download_delay)
|
||||||
|
else:
|
||||||
|
self.random_delay_interval = None
|
||||||
|
|
||||||
self.active = set()
|
self.active = set()
|
||||||
self.queue = []
|
self.queue = []
|
||||||
@ -44,6 +51,12 @@ class SpiderInfo(object):
|
|||||||
# use self.active to include requests in the downloader middleware
|
# use self.active to include requests in the downloader middleware
|
||||||
return len(self.active) > 2 * self.max_concurrent_requests
|
return len(self.active) > 2 * self.max_concurrent_requests
|
||||||
|
|
||||||
|
def download_delay(self):
|
||||||
|
if self.random_delay_interval:
|
||||||
|
return random.uniform(*self.random_delay_interval)
|
||||||
|
else:
|
||||||
|
return self._download_delay
|
||||||
|
|
||||||
def cancel_request_calls(self):
|
def cancel_request_calls(self):
|
||||||
for call in self.next_request_calls:
|
for call in self.next_request_calls:
|
||||||
call.cancel()
|
call.cancel()
|
||||||
@ -99,8 +112,9 @@ class Downloader(object):
|
|||||||
|
|
||||||
# Delay queue processing if a download_delay is configured
|
# Delay queue processing if a download_delay is configured
|
||||||
now = time()
|
now = time()
|
||||||
if site.download_delay:
|
delay = site.download_delay()
|
||||||
penalty = site.download_delay - now + site.lastseen
|
if delay:
|
||||||
|
penalty = delay - now + site.lastseen
|
||||||
if penalty > 0:
|
if penalty > 0:
|
||||||
d = defer.Deferred()
|
d = defer.Deferred()
|
||||||
d.addCallback(self._process_queue)
|
d.addCallback(self._process_queue)
|
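The change above turns download_delay from a plain attribute into a method: when RANDOMIZE_DOWNLOAD_DELAY is enabled, every call draws a fresh delay uniformly between 0.5x and 1.5x the configured value (the wget --random-wait policy), otherwise the fixed value is returned. A standalone sketch of that policy, independent of SpiderInfo:

import random

def next_delay(download_delay, randomize=True):
    """Return the number of seconds to wait before the next request."""
    if randomize and download_delay:
        # same policy as wget --random-wait: uniform in [0.5*d, 1.5*d]
        return random.uniform(0.5 * download_delay, 1.5 * download_delay)
    return download_delay

# e.g. with DOWNLOAD_DELAY = 2 the actual pauses fall between 1 and 3 seconds
print [round(next_delay(2), 2) for _ in range(3)]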
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import signal
|
import signal
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
|
|
||||||
@ -7,54 +6,13 @@ from scrapy.extension import extensions
|
|||||||
from scrapy import log
|
from scrapy import log
|
||||||
from scrapy.http import Request
|
from scrapy.http import Request
|
||||||
from scrapy.core.engine import scrapyengine
|
from scrapy.core.engine import scrapyengine
|
||||||
from scrapy.spider import BaseSpider, spiders
|
from scrapy.spider import spiders
|
||||||
from scrapy.utils.misc import arg_to_iter
|
from scrapy.utils.misc import arg_to_iter
|
||||||
from scrapy.utils.url import is_url
|
|
||||||
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
|
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
|
||||||
|
|
||||||
def _get_spider_requests(*args):
|
|
||||||
"""Collect requests and spiders from the given arguments. Returns a dict of
|
|
||||||
spider -> list of requests
|
|
||||||
"""
|
|
||||||
spider_requests = defaultdict(list)
|
|
||||||
for arg in args:
|
|
||||||
if isinstance(arg, tuple):
|
|
||||||
request, spider = arg
|
|
||||||
spider_requests[spider] = request
|
|
||||||
elif isinstance(arg, Request):
|
|
||||||
spider = spiders.fromurl(arg.url) or BaseSpider('default')
|
|
||||||
if spider:
|
|
||||||
spider_requests[spider] += [arg]
|
|
||||||
else:
|
|
||||||
log.msg('Could not find spider for request: %s' % arg, log.ERROR)
|
|
||||||
elif isinstance(arg, BaseSpider):
|
|
||||||
spider_requests[arg] += arg.start_requests()
|
|
||||||
elif is_url(arg):
|
|
||||||
spider = spiders.fromurl(arg) or BaseSpider('default')
|
|
||||||
if spider:
|
|
||||||
for req in arg_to_iter(spider.make_requests_from_url(arg)):
|
|
||||||
spider_requests[spider] += [req]
|
|
||||||
else:
|
|
||||||
log.msg('Could not find spider for url: %s' % arg, log.ERROR)
|
|
||||||
elif isinstance(arg, basestring):
|
|
||||||
spider = spiders.fromdomain(arg)
|
|
||||||
if spider:
|
|
||||||
spider_requests[spider] += spider.start_requests()
|
|
||||||
else:
|
|
||||||
log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
|
|
||||||
else:
|
|
||||||
raise TypeError("Unsupported argument: %r" % arg)
|
|
||||||
return spider_requests
|
|
||||||
|
|
||||||
|
|
||||||
class ExecutionManager(object):
|
class ExecutionManager(object):
|
||||||
"""Process a list of sites or urls.
|
|
||||||
|
|
||||||
This class should be used in a main for process a list of sites/urls.
|
|
||||||
|
|
||||||
It extracts products and could be used to store results in a database or
|
|
||||||
just for testing spiders.
|
|
||||||
"""
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.interrupted = False
|
self.interrupted = False
|
||||||
self.configured = False
|
self.configured = False
|
||||||
@ -78,24 +36,46 @@ class ExecutionManager(object):
|
|||||||
scrapyengine.configure()
|
scrapyengine.configure()
|
||||||
self.configured = True
|
self.configured = True
|
||||||
|
|
||||||
def crawl(self, *args):
|
def crawl_url(self, url, spider=None):
|
||||||
"""Schedule the given args for crawling. args is a list of urls or domains"""
|
"""Schedule given url for crawling."""
|
||||||
|
if spider is None:
|
||||||
|
spider = self._create_spider_for_request(Request(url), log_none=True, \
|
||||||
|
log_multiple=True)
|
||||||
|
if spider:
|
||||||
|
requests = arg_to_iter(spider.make_requests_from_url(url))
|
||||||
|
self._crawl_requests(requests, spider)
|
||||||
|
|
||||||
|
def crawl_request(self, request, spider=None):
|
||||||
|
"""Schedule request for crawling."""
|
||||||
assert self.configured, "Scrapy Manager not yet configured"
|
assert self.configured, "Scrapy Manager not yet configured"
|
||||||
spider_requests = _get_spider_requests(*args)
|
if spider is None:
|
||||||
for spider, requests in spider_requests.iteritems():
|
spider = self._create_spider_for_request(request, log_none=True, \
|
||||||
for request in requests:
|
log_multiple=True)
|
||||||
scrapyengine.crawl(request, spider)
|
if spider:
|
||||||
|
scrapyengine.crawl(request, spider)
|
||||||
|
|
||||||
def runonce(self, *args):
|
def crawl_spider_name(self, name):
|
||||||
"""Run the engine until it finishes scraping all domains and then exit"""
|
"""Schedule given spider by name for crawling."""
|
||||||
self.crawl(*args)
|
try:
|
||||||
scrapyengine.start()
|
spider = spiders.create(name)
|
||||||
if self.control_reactor:
|
except KeyError:
|
||||||
reactor.run(installSignalHandlers=False)
|
log.msg('Could not find spider: %s' % name, log.ERROR)
|
||||||
|
else:
|
||||||
|
self.crawl_spider(spider)
|
||||||
|
|
||||||
def start(self):
|
def crawl_spider(self, spider):
|
||||||
|
"""Schedule spider for crawling."""
|
||||||
|
requests = spider.start_requests()
|
||||||
|
self._crawl_requests(requests, spider)
|
||||||
|
|
||||||
|
def _crawl_requests(self, requests, spider):
|
||||||
|
"""Shortcut to schedule a list of requests"""
|
||||||
|
for req in requests:
|
||||||
|
self.crawl_request(req, spider)
|
||||||
|
|
||||||
|
def start(self, keep_alive=False):
|
||||||
"""Start the scrapy server, without scheduling any domains"""
|
"""Start the scrapy server, without scheduling any domains"""
|
||||||
scrapyengine.keep_alive = True
|
scrapyengine.keep_alive = keep_alive
|
||||||
scrapyengine.start()
|
scrapyengine.start()
|
||||||
if self.control_reactor:
|
if self.control_reactor:
|
||||||
reactor.run(installSignalHandlers=False)
|
reactor.run(installSignalHandlers=False)
|
||||||
@ -105,6 +85,17 @@ class ExecutionManager(object):
|
|||||||
self.interrupted = True
|
self.interrupted = True
|
||||||
scrapyengine.stop()
|
scrapyengine.stop()
|
||||||
|
|
||||||
|
def _create_spider_for_request(self, request, default=None, log_none=False, \
|
||||||
|
log_multiple=False):
|
||||||
|
spider_names = spiders.find_by_request(request)
|
||||||
|
if len(spider_names) == 1:
|
||||||
|
return spiders.create(spider_names[0])
|
||||||
|
if len(spider_names) > 1 and log_multiple:
|
||||||
|
log.msg('More than one spider found for: %s' % request, log.ERROR)
|
||||||
|
if len(spider_names) == 0 and log_none:
|
||||||
|
log.msg('Could not find spider for: %s' % request, log.ERROR)
|
||||||
|
return default
|
||||||
|
|
||||||
def _signal_shutdown(self, signum, _):
|
def _signal_shutdown(self, signum, _):
|
||||||
signame = signal_names[signum]
|
signame = signal_names[signum]
|
||||||
log.msg("Received %s, shutting down gracefully. Send again to force " \
|
log.msg("Received %s, shutting down gracefully. Send again to force " \
|
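The generic crawl(*args) entry point is replaced by explicit methods, with spider lookup delegated to the spider manager (find_by_request / create) through _create_spider_for_request. A hedged sketch of how callers that used to pass urls and domain strings to crawl() might use the new API (spider name and urls are illustrative):

from scrapy.core.manager import scrapymanager
from scrapy.http import Request

scrapymanager.configure()

# schedule a single url; the spider is resolved from the request
scrapymanager.crawl_url('http://example.com/some/page')

# schedule an explicit request
scrapymanager.crawl_request(Request('http://example.com/other'))

# schedule a whole spider by name (was: crawl('example.com'))
scrapymanager.crawl_spider_name('example.com')

# run until everything scheduled so far is finished
scrapymanager.start()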
||||||
|
@ -1,66 +0,0 @@
|
|||||||
"""
|
|
||||||
Crawler class
|
|
||||||
|
|
||||||
The Crawler class can be used to crawl pages using the Scrapy crawler from
|
|
||||||
outside a Scrapy project, for example, from a standalone script.
|
|
||||||
|
|
||||||
To use it, instantiate it and call the "crawl" method with one (or more)
|
|
||||||
requests. For example:
|
|
||||||
|
|
||||||
>>> from scrapy.crawler import Crawler
|
|
||||||
>>> from scrapy.http import Request
|
|
||||||
>>> def parse_response(response):
|
|
||||||
... print "Visited: %s" % response.url
|
|
||||||
...
|
|
||||||
>>> request = Request('http://scrapy.org', callback=parse_response)
|
|
||||||
>>> crawler = Crawler()
|
|
||||||
>>> crawler.crawl(request)
|
|
||||||
Visited: http://scrapy.org
|
|
||||||
>>>
|
|
||||||
|
|
||||||
Request callbacks follow the same API of spiders callback, which means that all
|
|
||||||
requests returned from the callbacks will be followed.
|
|
||||||
|
|
||||||
See examples/scripts/count_and_follow_links.py for a more detailed example.
|
|
||||||
|
|
||||||
WARNING: The Crawler class currently has a big limitation - it cannot be used
|
|
||||||
more than once in the same Python process. This is due to the fact that Twisted
|
|
||||||
reactors cannot be restarted. Hopefully, this limitation will be removed in the
|
|
||||||
future.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from scrapy.xlib.pydispatch import dispatcher
|
|
||||||
from scrapy.core.manager import scrapymanager
|
|
||||||
from scrapy.core.engine import scrapyengine
|
|
||||||
from scrapy.conf import settings as scrapy_settings
|
|
||||||
from scrapy import log
|
|
||||||
|
|
||||||
class Crawler(object):
|
|
||||||
|
|
||||||
def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False, \
|
|
||||||
settings=None):
|
|
||||||
self.stop_on_error = stop_on_error
|
|
||||||
self.silence_errors = silence_errors
|
|
||||||
# disable offsite middleware (by default) because it prevents free crawling
|
|
||||||
if settings is not None:
|
|
||||||
settings.overrides.update(settings)
|
|
||||||
scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
|
|
||||||
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
|
|
||||||
scrapy_settings.overrides['LOG_ENABLED'] = enable_log
|
|
||||||
scrapymanager.configure()
|
|
||||||
dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)
|
|
||||||
|
|
||||||
def crawl(self, *args):
|
|
||||||
scrapymanager.runonce(*args)
|
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
scrapyengine.stop()
|
|
||||||
log.log_level = log.SILENT
|
|
||||||
scrapyengine.kill()
|
|
||||||
|
|
||||||
def _logmessage_received(self, message, level):
|
|
||||||
if level <= log.ERROR:
|
|
||||||
if not self.silence_errors:
|
|
||||||
print "Crawler error: %s" % message
|
|
||||||
if self.stop_on_error:
|
|
||||||
self.stop()
|
|
@ -96,20 +96,12 @@ class Request(object_ref):
|
|||||||
"""Return a copy of this Request"""
|
"""Return a copy of this Request"""
|
||||||
return self.replace()
|
return self.replace()
|
||||||
|
|
||||||
def replace(self, url=None, callback=None, method=None, headers=None, body=None, \
|
def replace(self, *args, **kwargs):
|
||||||
cookies=None, meta=None, encoding=None, priority=None, \
|
|
||||||
dont_filter=None, errback=None):
|
|
||||||
"""Create a new Request with the same attributes except for those
|
"""Create a new Request with the same attributes except for those
|
||||||
given new values.
|
given new values.
|
||||||
"""
|
"""
|
||||||
return self.__class__(url=self.url if url is None else url,
|
for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', \
|
||||||
callback=callback,
|
'encoding', 'priority', 'dont_filter']:
|
||||||
method=self.method if method is None else method,
|
kwargs.setdefault(x, getattr(self, x))
|
||||||
headers=copy.deepcopy(self.headers) if headers is None else headers,
|
cls = kwargs.pop('cls', self.__class__)
|
||||||
body=self.body if body is None else body,
|
return cls(*args, **kwargs)
|
||||||
cookies=self.cookies if cookies is None else cookies,
|
|
||||||
meta=self.meta if meta is None else meta,
|
|
||||||
encoding=self.encoding if encoding is None else encoding,
|
|
||||||
priority=self.priority if priority is None else priority,
|
|
||||||
dont_filter=self.dont_filter if dont_filter is None else dont_filter,
|
|
||||||
errback=errback)
|
|
||||||
|
@ -71,18 +71,11 @@ class Response(object_ref):
|
|||||||
"""Return a copy of this Response"""
|
"""Return a copy of this Response"""
|
||||||
return self.replace()
|
return self.replace()
|
||||||
|
|
||||||
def replace(self, url=None, status=None, headers=None, body=None, meta=None, \
|
def replace(self, *args, **kwargs):
|
||||||
flags=None, cls=None, **kwargs):
|
|
||||||
"""Create a new Response with the same attributes except for those
|
"""Create a new Response with the same attributes except for those
|
||||||
given new values.
|
given new values.
|
||||||
"""
|
"""
|
||||||
if cls is None:
|
for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
|
||||||
cls = self.__class__
|
kwargs.setdefault(x, getattr(self, x))
|
||||||
new = cls(url=self.url if url is None else url,
|
cls = kwargs.pop('cls', self.__class__)
|
||||||
status=self.status if status is None else status,
|
return cls(*args, **kwargs)
|
||||||
headers=copy.deepcopy(self.headers) if headers is None else headers,
|
|
||||||
body=self.body if body is None else body,
|
|
||||||
meta=self.meta if meta is None else meta,
|
|
||||||
flags=self.flags if flags is None else flags,
|
|
||||||
**kwargs)
|
|
||||||
return new
|
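Both Request.replace() and Response.replace() now take arbitrary keyword arguments: every attribute not passed explicitly is copied from the original via setdefault, and a cls keyword switches the class of the returned object. A small hedged illustration (urls invented):

from scrapy.http import Request, Response, HtmlResponse

req = Request('http://example.com/page', meta={'depth': 1})

# same request, different url; meta, headers, etc. are carried over
req2 = req.replace(url='http://example.com/page?p=2')
print req2.meta        # {'depth': 1}

resp = Response('http://example.com/page', body='<html></html>')

# re-type a response while keeping its attributes
html = resp.replace(cls=HtmlResponse)
print type(html).__name__   # HtmlResponse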
|
||||||
|
@ -23,9 +23,6 @@ class HtmlResponse(TextResponse):
|
|||||||
METATAG_RE = re.compile(r'<meta\s+%s\s+%s' % (_httpequiv_re, _content_re), re.I)
|
METATAG_RE = re.compile(r'<meta\s+%s\s+%s' % (_httpequiv_re, _content_re), re.I)
|
||||||
METATAG_RE2 = re.compile(r'<meta\s+%s\s+%s' % (_content_re, _httpequiv_re), re.I)
|
METATAG_RE2 = re.compile(r'<meta\s+%s\s+%s' % (_content_re, _httpequiv_re), re.I)
|
||||||
|
|
||||||
def body_encoding(self):
|
|
||||||
return self._body_declared_encoding() or super(HtmlResponse, self).body_encoding()
|
|
||||||
|
|
||||||
@memoizemethod_noargs
|
@memoizemethod_noargs
|
||||||
def _body_declared_encoding(self):
|
def _body_declared_encoding(self):
|
||||||
chunk = self.body[:5000]
|
chunk = self.body[:5000]
|
||||||
|
@ -6,24 +6,31 @@ See documentation in docs/topics/request-response.rst
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import codecs
|
||||||
from scrapy.xlib.BeautifulSoup import UnicodeDammit
|
from scrapy.xlib.BeautifulSoup import UnicodeDammit
|
||||||
|
|
||||||
from scrapy.http.response import Response
|
from scrapy.http.response import Response
|
||||||
from scrapy.utils.python import memoizemethod_noargs
|
from scrapy.utils.python import memoizemethod_noargs
|
||||||
|
from scrapy.utils.encoding import encoding_exists, resolve_encoding
|
||||||
from scrapy.conf import settings
|
from scrapy.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
# Python's decoder doesn't follow the Unicode standard when handling
|
||||||
|
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
|
||||||
|
codecs.register_error('scrapy_replace', lambda exc: (u'\ufffd', exc.start+1))
|
||||||
|
|
||||||
|
|
||||||
class TextResponse(Response):
|
class TextResponse(Response):
|
||||||
|
|
||||||
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
|
_DEFAULT_ENCODING = settings['DEFAULT_RESPONSE_ENCODING']
|
||||||
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
|
_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
|
||||||
|
|
||||||
__slots__ = ['_encoding', '_body_inferred_encoding']
|
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
|
||||||
|
|
||||||
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
|
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
|
||||||
flags=None, encoding=None):
|
flags=None, encoding=None):
|
||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._body_inferred_encoding = None
|
self._cached_benc = None
|
||||||
|
self._cached_ubody = None
|
||||||
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
|
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
|
||||||
|
|
||||||
def _get_url(self):
|
def _get_url(self):
|
||||||
@ -56,31 +63,48 @@ class TextResponse(Response):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def encoding(self):
|
def encoding(self):
|
||||||
return self._encoding or self.headers_encoding() or self.body_encoding()
|
return self._get_encoding(infer=True)
|
||||||
|
|
||||||
@memoizemethod_noargs
|
def _get_encoding(self, infer=False):
|
||||||
def headers_encoding(self):
|
enc = self._declared_encoding()
|
||||||
content_type = self.headers.get('Content-Type')
|
if enc and not encoding_exists(enc):
|
||||||
if content_type:
|
enc = None
|
||||||
encoding = self._ENCODING_RE.search(content_type)
|
if not enc and infer:
|
||||||
if encoding:
|
enc = self._body_inferred_encoding()
|
||||||
return encoding.group(1)
|
if not enc:
|
||||||
|
enc = self._DEFAULT_ENCODING
|
||||||
|
return resolve_encoding(enc)
|
||||||
|
|
||||||
|
def _declared_encoding(self):
|
||||||
|
return self._encoding or self._headers_encoding() \
|
||||||
|
or self._body_declared_encoding()
|
||||||
|
|
||||||
@memoizemethod_noargs
|
|
||||||
def body_as_unicode(self):
|
def body_as_unicode(self):
|
||||||
"""Return body as unicode"""
|
"""Return body as unicode"""
|
||||||
possible_encodings = (self._encoding, self.headers_encoding(), \
|
if self._cached_ubody is None:
|
||||||
self._body_declared_encoding())
|
self._cached_ubody = self.body.decode(self.encoding, 'scrapy_replace')
|
||||||
dammit = UnicodeDammit(self.body, possible_encodings)
|
return self._cached_ubody
|
||||||
self._body_inferred_encoding = dammit.originalEncoding
|
|
||||||
if self._body_inferred_encoding in ('ascii', None):
|
|
||||||
self._body_inferred_encoding = self._DEFAULT_ENCODING
|
|
||||||
return dammit.unicode
|
|
||||||
|
|
||||||
def body_encoding(self):
|
@memoizemethod_noargs
|
||||||
if self._body_inferred_encoding is None:
|
def _headers_encoding(self):
|
||||||
self.body_as_unicode()
|
content_type = self.headers.get('Content-Type')
|
||||||
return self._body_inferred_encoding
|
if content_type:
|
||||||
|
m = self._ENCODING_RE.search(content_type)
|
||||||
|
if m:
|
||||||
|
encoding = m.group(1)
|
||||||
|
if encoding_exists(encoding):
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def _body_inferred_encoding(self):
|
||||||
|
if self._cached_benc is None:
|
||||||
|
enc = self._get_encoding()
|
||||||
|
dammit = UnicodeDammit(self.body, [enc])
|
||||||
|
benc = dammit.originalEncoding
|
||||||
|
self._cached_benc = benc
|
||||||
|
# UnicodeDammit is buggy decoding utf-16
|
||||||
|
if self._cached_ubody is None and benc != 'utf-16':
|
||||||
|
self._cached_ubody = dammit.unicode
|
||||||
|
return self._cached_benc
|
||||||
|
|
||||||
def _body_declared_encoding(self):
|
def _body_declared_encoding(self):
|
||||||
# implemented in subclasses (XmlResponse, HtmlResponse)
|
# implemented in subclasses (XmlResponse, HtmlResponse)
|
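After this change the encoding property resolves in a fixed order: the encoding passed to the constructor, then the Content-Type header, then the encoding declared in the body, then (only when inference is requested) whatever UnicodeDammit detects, and finally DEFAULT_RESPONSE_ENCODING; names that don't resolve to a known codec are discarded along the way. A hedged illustration (headers given as a plain dict, which the Headers class accepts elsewhere in Scrapy):

from scrapy.http import TextResponse

resp = TextResponse('http://example.com/',
                    headers={'Content-Type': 'text/html; charset=utf-8'},
                    body='caf\xc3\xa9')
print resp.encoding            # resolved from the Content-Type header
print resp.body_as_unicode()   # u'caf\xe9', decoded with that encoding

# a bogus declared charset is ignored and the body is inspected instead
resp2 = TextResponse('http://example.com/',
                     headers={'Content-Type': 'text/html; charset=bogus'},
                     body='plain ascii body')
print resp2.encoding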
||||||
|
@ -18,9 +18,6 @@ class XmlResponse(TextResponse):
|
|||||||
_encoding_re = _template % ('encoding', r'(?P<charset>[\w-]+)')
|
_encoding_re = _template % ('encoding', r'(?P<charset>[\w-]+)')
|
||||||
XMLDECL_RE = re.compile(r'<\?xml\s.*?%s' % _encoding_re, re.I)
|
XMLDECL_RE = re.compile(r'<\?xml\s.*?%s' % _encoding_re, re.I)
|
||||||
|
|
||||||
def body_encoding(self):
|
|
||||||
return self._body_declared_encoding() or super(XmlResponse, self).body_encoding()
|
|
||||||
|
|
||||||
@memoizemethod_noargs
|
@memoizemethod_noargs
|
||||||
def _body_declared_encoding(self):
|
def _body_declared_encoding(self):
|
||||||
chunk = self.body[:5000]
|
chunk = self.body[:5000]
|
||||||
|
@ -29,8 +29,9 @@ BOT_NAME = settings['BOT_NAME']
|
|||||||
# args: message, level, spider
|
# args: message, level, spider
|
||||||
logmessage_received = object()
|
logmessage_received = object()
|
||||||
|
|
||||||
# default logging level
|
# default values
|
||||||
log_level = DEBUG
|
log_level = DEBUG
|
||||||
|
log_encoding = 'utf-8'
|
||||||
|
|
||||||
started = False
|
started = False
|
||||||
|
|
||||||
@ -47,11 +48,12 @@ def _get_log_level(level_name_or_id=None):
|
|||||||
|
|
||||||
def start(logfile=None, loglevel=None, logstdout=None):
|
def start(logfile=None, loglevel=None, logstdout=None):
|
||||||
"""Initialize and start logging facility"""
|
"""Initialize and start logging facility"""
|
||||||
global log_level, started
|
global log_level, log_encoding, started
|
||||||
|
|
||||||
if started or not settings.getbool('LOG_ENABLED'):
|
if started or not settings.getbool('LOG_ENABLED'):
|
||||||
return
|
return
|
||||||
log_level = _get_log_level(loglevel)
|
log_level = _get_log_level(loglevel)
|
||||||
|
log_encoding = settings['LOG_ENCODING']
|
||||||
started = True
|
started = True
|
||||||
|
|
||||||
# set log observer
|
# set log observer
|
||||||
@ -73,8 +75,8 @@ def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
|
|||||||
"use 'spider' argument instead", DeprecationWarning, stacklevel=2)
|
"use 'spider' argument instead", DeprecationWarning, stacklevel=2)
|
||||||
dispatcher.send(signal=logmessage_received, message=message, level=level, \
|
dispatcher.send(signal=logmessage_received, message=message, level=level, \
|
||||||
spider=spider)
|
spider=spider)
|
||||||
system = domain or (spider.domain_name if spider else component)
|
system = domain or (spider.name if spider else component)
|
||||||
msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
|
msg_txt = unicode_to_str("%s: %s" % (level_names[level], message), log_encoding)
|
||||||
log.msg(msg_txt, system=system)
|
log.msg(msg_txt, system=system)
|
||||||
|
|
||||||
def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None):
|
def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None):
|
||||||
@ -91,7 +93,7 @@ def err(_stuff=None, _why=None, **kwargs):
|
|||||||
import warnings
|
import warnings
|
||||||
warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \
|
warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \
|
||||||
"use 'spider' argument instead", DeprecationWarning, stacklevel=2)
|
"use 'spider' argument instead", DeprecationWarning, stacklevel=2)
|
||||||
kwargs['system'] = domain or (spider.domain_name if spider else component)
|
kwargs['system'] = domain or (spider.name if spider else component)
|
||||||
if _why:
|
if _why:
|
||||||
_why = unicode_to_str("ERROR: %s" % _why)
|
_why = unicode_to_str("ERROR: %s" % _why, log_encoding)
|
||||||
log.err(_stuff, _why, **kwargs)
|
log.err(_stuff, _why, **kwargs)
|
||||||
|
@ -47,34 +47,26 @@ class MailSender(object):
|
|||||||
part = MIMEBase(*mimetype.split('/'))
|
part = MIMEBase(*mimetype.split('/'))
|
||||||
part.set_payload(f.read())
|
part.set_payload(f.read())
|
||||||
Encoders.encode_base64(part)
|
Encoders.encode_base64(part)
|
||||||
part.add_header('Content-Disposition', 'attachment; filename="%s"' % attach_name)
|
part.add_header('Content-Disposition', 'attachment; filename="%s"' \
|
||||||
|
% attach_name)
|
||||||
msg.attach(part)
|
msg.attach(part)
|
||||||
else:
|
else:
|
||||||
msg.set_payload(body)
|
msg.set_payload(body)
|
||||||
|
|
||||||
# FIXME ---------------------------------------------------------------------
|
dfd = self._sendmail(self.smtphost, self.mailfrom, rcpts, msg.as_string())
|
||||||
# There seems to be a problem with sending emails using deferreds when
|
dfd.addCallbacks(self._sent_ok, self._sent_failed,
|
||||||
# the last thing left to do is sending the mail, cause the engine stops
|
callbackArgs=[to, cc, subject, len(attachs)],
|
||||||
# the reactor and the email don't get send. we need to fix this. until
|
errbackArgs=[to, cc, subject, len(attachs)])
|
||||||
# then, we'll revert to use Python standard (IO-blocking) smtplib.
|
reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
|
||||||
|
|
||||||
#dfd = self._sendmail(self.smtphost, self.mailfrom, rcpts, msg.as_string())
|
|
||||||
#dfd.addCallbacks(self._sent_ok, self._sent_failed,
|
|
||||||
# callbackArgs=[to, cc, subject, len(attachs)],
|
|
||||||
# errbackArgs=[to, cc, subject, len(attachs)])
|
|
||||||
import smtplib
|
|
||||||
smtp = smtplib.SMTP(self.smtphost)
|
|
||||||
smtp.sendmail(self.mailfrom, rcpts, msg.as_string())
|
|
||||||
log.msg('Mail sent: To=%s Cc=%s Subject="%s"' % (to, cc, subject))
|
|
||||||
smtp.close()
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _sent_ok(self, result, to, cc, subject, nattachs):
|
def _sent_ok(self, result, to, cc, subject, nattachs):
|
||||||
log.msg('Mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' % (to, cc, subject, nattachs))
|
log.msg('Mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' % \
|
||||||
|
(to, cc, subject, nattachs))
|
||||||
|
|
||||||
def _sent_failed(self, failure, to, cc, subject, nattachs):
|
def _sent_failed(self, failure, to, cc, subject, nattachs):
|
||||||
errstr = str(failure.value)
|
errstr = str(failure.value)
|
||||||
log.msg('Unable to send mail: To=%s Cc=%s Subject="%s" Attachs=%d - %s' % (to, cc, subject, nattachs, errstr), level=log.ERROR)
|
log.msg('Unable to send mail: To=%s Cc=%s Subject="%s" Attachs=%d - %s' % \
|
||||||
|
(to, cc, subject, nattachs, errstr), level=log.ERROR)
|
||||||
|
|
||||||
def _sendmail(self, smtphost, from_addr, to_addrs, msg, port=25):
|
def _sendmail(self, smtphost, from_addr, to_addrs, msg, port=25):
|
||||||
""" This is based on twisted.mail.smtp.sendmail except that it
|
""" This is based on twisted.mail.smtp.sendmail except that it
|
||||||
|
@ -29,8 +29,8 @@ class XPathSelector(object_ref):
|
|||||||
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
||||||
self.xmlNode = self.doc.xmlDoc
|
self.xmlNode = self.doc.xmlDoc
|
||||||
elif text:
|
elif text:
|
||||||
response = TextResponse(url='about:blank', body=unicode_to_str(text), \
|
response = TextResponse(url='about:blank', \
|
||||||
encoding='utf-8')
|
body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
|
||||||
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
||||||
self.xmlNode = self.doc.xmlDoc
|
self.xmlNode = self.doc.xmlDoc
|
||||||
self.expr = expr
|
self.expr = expr
|
||||||
|
60
scrapy/service.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
import sys, os
|
||||||
|
|
||||||
|
from twisted.python import log
|
||||||
|
from twisted.internet import reactor, protocol, error
|
||||||
|
from twisted.application.service import Service
|
||||||
|
|
||||||
|
from scrapy.utils.py26 import cpu_count
|
||||||
|
from scrapy.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapyService(Service):
|
||||||
|
|
||||||
|
def startService(self):
|
||||||
|
reactor.callWhenRunning(self.start_processes)
|
||||||
|
|
||||||
|
def start_processes(self):
|
||||||
|
for i in range(cpu_count()):
|
||||||
|
self.start_process(i+1)
|
||||||
|
|
||||||
|
def start_process(self, id):
|
||||||
|
args = [sys.executable, '-m', 'scrapy.service']
|
||||||
|
env = os.environ.copy()
|
||||||
|
self.set_log_file(env, id)
|
||||||
|
pp = ScrapyProcessProtocol(self, id, env.get('SCRAPY_LOG_FILE'))
|
||||||
|
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
|
||||||
|
|
||||||
|
def set_log_file(self, env, suffix):
|
||||||
|
logfile = settings['LOG_FILE']
|
||||||
|
if logfile:
|
||||||
|
file, ext = os.path.splitext(logfile)
|
||||||
|
env['SCRAPY_LOG_FILE'] = "%s-%s%s" % (file, suffix, ext)
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapyProcessProtocol(protocol.ProcessProtocol):
|
||||||
|
|
||||||
|
def __init__(self, service, id, logfile):
|
||||||
|
self.service = service
|
||||||
|
self.id = id
|
||||||
|
self.logfile = logfile
|
||||||
|
self.pid = None
|
||||||
|
|
||||||
|
def connectionMade(self):
|
||||||
|
self.pid = self.transport.pid
|
||||||
|
log.msg("Process %r started: pid=%r logfile=%r" % (self.id, self.pid, \
|
||||||
|
self.logfile))
|
||||||
|
|
||||||
|
def processEnded(self, status):
|
||||||
|
if isinstance(status.value, error.ProcessDone):
|
||||||
|
log.msg("Process %r finished: pid=%r logfile=%r" % (self.id, \
|
||||||
|
self.pid, self.logfile))
|
||||||
|
else:
|
||||||
|
log.msg("Process %r died: exitstatus=%r pid=%r logfile=%r" % \
|
||||||
|
(self.id, status.value.exitCode, self.pid, self.logfile))
|
||||||
|
reactor.callLater(5, self.service.start_process, self.id)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from scrapy.core.manager import scrapymanager
|
||||||
|
scrapymanager.configure()
|
||||||
|
scrapymanager.start(keep_alive=True)
|
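ScrapyService spawns one scrapy.service child process per CPU and respawns any child that dies, giving each one its own log file by suffixing LOG_FILE. A hedged, standalone sketch of the log-file naming that set_log_file() applies (illustration only, not part of the patch):

import os

def per_process_logfile(logfile, suffix):
    """Mirror ScrapyService.set_log_file(): insert a per-process suffix."""
    name, ext = os.path.splitext(logfile)
    return "%s-%s%s" % (name, suffix, ext)

# with LOG_FILE = 'scrapy.log' and four worker processes the children log to:
print [per_process_logfile('scrapy.log', i + 1) for i in range(4)]
# ['scrapy-1.log', 'scrapy-2.log', 'scrapy-3.log', 'scrapy-4.log']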
@ -35,6 +35,7 @@ def parse_url(url):
|
|||||||
u = urlparse.urlparse(url)
|
u = urlparse.urlparse(url)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
class Shell(object):
|
class Shell(object):
|
||||||
|
|
||||||
requires_project = False
|
requires_project = False
|
||||||
@ -52,18 +53,21 @@ class Shell(object):
|
|||||||
else:
|
else:
|
||||||
url = parse_url(request_or_url)
|
url = parse_url(request_or_url)
|
||||||
request = Request(url)
|
request = Request(url)
|
||||||
spider = spiders.fromurl(url) or BaseSpider('default')
|
|
||||||
|
spider = scrapymanager._create_spider_for_request(request, \
|
||||||
|
BaseSpider('default'), log_multiple=True)
|
||||||
|
|
||||||
print "Fetching %s..." % request
|
print "Fetching %s..." % request
|
||||||
response = threads.blockingCallFromThread(reactor, scrapyengine.schedule, \
|
response = threads.blockingCallFromThread(reactor, scrapyengine.schedule, \
|
||||||
request, spider)
|
request, spider)
|
||||||
if response:
|
if response:
|
||||||
self.populate_vars(url, response, request)
|
self.populate_vars(url, response, request, spider)
|
||||||
if print_help:
|
if print_help:
|
||||||
self.print_help()
|
self.print_help()
|
||||||
else:
|
else:
|
||||||
print "Done - use shelp() to see available objects"
|
print "Done - use shelp() to see available objects"
|
||||||
|
|
||||||
def populate_vars(self, url=None, response=None, request=None):
|
def populate_vars(self, url=None, response=None, request=None, spider=None):
|
||||||
item = self.item_class()
|
item = self.item_class()
|
||||||
self.vars['item'] = item
|
self.vars['item'] = item
|
||||||
if url:
|
if url:
|
||||||
@ -73,7 +77,7 @@ class Shell(object):
|
|||||||
self.vars['url'] = url
|
self.vars['url'] = url
|
||||||
self.vars['response'] = response
|
self.vars['response'] = response
|
||||||
self.vars['request'] = request
|
self.vars['request'] = request
|
||||||
self.vars['spider'] = spiders.fromurl(url)
|
self.vars['spider'] = spider
|
||||||
if not self.nofetch:
|
if not self.nofetch:
|
||||||
self.vars['fetch'] = self.fetch
|
self.vars['fetch'] = self.fetch
|
||||||
self.vars['view'] = open_in_browser
|
self.vars['view'] = open_in_browser
|
||||||
@ -104,7 +108,7 @@ class Shell(object):
|
|||||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||||
|
|
||||||
reactor.callInThread(self._console_thread, url)
|
reactor.callInThread(self._console_thread, url)
|
||||||
scrapymanager.start()
|
scrapymanager.start(keep_alive=True)
|
||||||
|
|
||||||
def inspect_response(self, response):
|
def inspect_response(self, response):
|
||||||
print
|
print
|
||||||
|
@@ -3,6 +3,9 @@ Base class for Scrapy spiders

 See documentation in docs/topics/spiders.rst
 """

+import warnings
+
 from zope.interface import Interface, Attribute, invariant, implements
 from twisted.plugin import IPlugin

@@ -11,17 +14,9 @@ from scrapy.http import Request
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.trackref import object_ref

-def _valid_domain_name(obj):
-    """Check the domain name specified is valid"""
-    if not obj.domain_name:
-        raise ValueError("Spider 'domain_name' attribute is required")
-
 class ISpider(Interface, IPlugin) :
-    """Interface to be implemented by site-specific web spiders"""
-    domain_name = Attribute("The domain name of the site to be scraped.")
-
-    invariant(_valid_domain_name)
+    """Interface used by TwistedPluginSpiderManager to discover spiders"""
+    pass

 class BaseSpider(object_ref):
     """Base class for scrapy spiders. All spiders must inherit from this
@@ -31,19 +26,37 @@ class BaseSpider(object_ref):
     implements(ISpider)

     # XXX: class attributes kept for backwards compatibility
-    domain_name = None
+    name = None
     start_urls = []
-    extra_domain_names = []
+    allowed_domains = []

-    def __init__(self, domain_name=None):
-        if domain_name is not None:
-            self.domain_name = domain_name
+    def __init__(self, name=None, **kwargs):
+        self.__dict__.update(kwargs)
+        # XXX: SEP-12 backward compatibility (remove for 0.10)
+        if hasattr(self, 'domain_name'):
+            warnings.warn("Spider.domain_name attribute is deprecated, use Spider.name instead and Spider.allowed_domains", \
+                DeprecationWarning, stacklevel=4)
+            self.name = self.domain_name
+            self.allowed_domains = [self.name]
+        if hasattr(self, 'extra_domain_names'):
+            warnings.warn("Spider.extra_domain_names attribute is deprecated - user Spider.allowed_domains instead", \
+                DeprecationWarning, stacklevel=4)
+            self.allowed_domains += list(self.extra_domain_names)
+
+        if name is not None:
+            self.name = name
         # XXX: create instance attributes (class attributes were kept for
         # backwards compatibility)
         if not self.start_urls:
             self.start_urls = []
-        if not self.extra_domain_names:
-            self.extra_domain_names = []
+        if not self.allowed_domains:
+            self.allowed_domains = []
+        if not self.name:
+            raise ValueError("%s must have a name" % type(self).__name__)
+
+        # XXX: SEP-12 forward compatibility (remove for 0.10)
+        self.domain_name = self.name
+        self.extra_domain_names = self.allowed_domains

     def log(self, message, level=log.DEBUG):
         """Log the given messages at the given log level. Always use this
@@ -67,6 +80,6 @@ class BaseSpider(object_ref):
         pass

     def __str__(self):
-        return "<%s %r>" % (type(self).__name__, self.domain_name)
+        return "<%s %r>" % (type(self).__name__, self.name)

     __repr__ = __str__
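Note: the BaseSpider hunks above implement the SEP-12 renaming (`domain_name` to `name`, `extra_domain_names` to `allowed_domains`) while keeping old spiders working for one more release via the deprecation shims in `__init__`. A before/after sketch of a spider definition under both conventions; the class names and domains are invented for illustration:

    from scrapy.spider import BaseSpider

    # Pre-0.9 style: still accepted after this commit, but triggers a DeprecationWarning
    class OldStyleSpider(BaseSpider):
        domain_name = 'example.com'
        extra_domain_names = ['www.example.com']
        start_urls = ['http://www.example.com/']

    # SEP-12 style introduced here
    class NewStyleSpider(BaseSpider):
        name = 'example.com'
        allowed_domains = ['example.com', 'www.example.com']
        start_urls = ['http://www.example.com/']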
@@ -76,11 +76,11 @@ class MemoryStatsCollector(StatsCollector):

     def __init__(self):
         super(MemoryStatsCollector, self).__init__()
-        self.domain_stats = {}
+        self.spider_stats = {}

     def _persist_stats(self, stats, spider=None):
         if spider is not None:
-            self.domain_stats[spider.domain_name] = stats
+            self.spider_stats[spider.name] = stats


 class DummyStatsCollector(StatsCollector):
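Note: stats are now keyed by `spider.name` rather than `spider.domain_name`. A sketch of what `MemoryStatsCollector.spider_stats` might hold after a couple of runs; the stat keys and values below are invented for illustration:

    # Hypothetical contents of collector.spider_stats (keys and values invented)
    spider_stats = {
        'example.com': {'item_scraped_count': 120, 'finish_status': 'finished'},
        'another.org': {'item_scraped_count': 7, 'finish_status': 'finished'},
    }

    # before this change the same data lived in collector.domain_stats,
    # keyed by spider.domain_name
    print spider_stats['example.com']['item_scraped_count']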
@@ -1,31 +0,0 @@
-"""
-A Stats collector for persisting stats (pickled) to a MySQL db
-"""
-
-import cPickle as pickle
-from datetime import datetime
-
-from scrapy.stats.collector import StatsCollector
-from scrapy.utils.mysql import mysql_connect
-from scrapy.conf import settings
-
-class MysqlStatsCollector(StatsCollector):
-
-    def __init__(self):
-        super(MysqlStatsCollector, self).__init__()
-        mysqluri = settings['STATS_MYSQL_URI']
-        self._mysql_conn = mysql_connect(mysqluri, use_unicode=False) if mysqluri else None
-
-    def _persist_stats(self, stats, spider=None):
-        if spider is None: # only store spider-specific stats
-            return
-        if self._mysql_conn is None:
-            return
-        stored = datetime.utcnow()
-        datas = pickle.dumps(stats)
-        table = 'domain_data_history'
-
-        c = self._mysql_conn.cursor()
-        c.execute("INSERT INTO %s (domain,stored,data) VALUES (%%s,%%s,%%s)" % table, \
-            (spider.domain_name, stored, datas))
-        self._mysql_conn.commit()
@@ -36,9 +36,9 @@ class SimpledbStatsCollector(StatsCollector):

     def _persist_to_sdb(self, spider, stats):
         ts = self._get_timestamp(spider).isoformat()
-        sdb_item_id = "%s_%s" % (spider.domain_name, ts)
+        sdb_item_id = "%s_%s" % (spider.name, ts)
         sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
-        sdb_item['domain'] = spider.domain_name
+        sdb_item['spider'] = spider.name
         sdb_item['timestamp'] = self._to_sdb_value(ts)
         connect_sdb().put_attributes(self._sdbdomain, sdb_item_id, sdb_item)

@@ -4,5 +4,5 @@
 # See: http://doc.scrapy.org/topics/item-pipeline.html

 class ${ProjectName}Pipeline(object):
-    def process_item(self, domain, item):
+    def process_item(self, spider, item):
         return item
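Note: the project template now passes the spider object (not a domain string) into `process_item()`. A minimal pipeline sketch using the new signature; the validation logic is invented, and the `DropItem` import path is an assumption for this Scrapy version:

    from scrapy.core.exceptions import DropItem  # import path assumed for 0.9-era Scrapy

    class PriceValidationPipeline(object):
        """Hypothetical pipeline that drops items without a price."""

        def process_item(self, spider, item):
            # the spider instance is available here, e.g. for per-spider logging
            if not item.get('price'):
                raise DropItem("missing price in item from %s" % spider.name)
            return item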
@@ -1,9 +1,10 @@
 from scrapy.spider import BaseSpider

 class $classname(BaseSpider):
-    domain_name = "$site"
+    name = "$name"
+    allowed_domains = ["$domain"]
     start_urls = (
-        'http://www.$site/',
+        'http://www.$domain/',
         )

     def parse(self, response):
@@ -6,19 +6,20 @@ from scrapy.contrib.spiders import CrawlSpider, Rule
 from $project_name.items import ${ProjectName}Item

 class $classname(CrawlSpider):
-    domain_name = '$site'
-    start_urls = ['http://www.$site/']
+    name = '$name'
+    allowed_domains = ['$domain']
+    start_urls = ['http://www.$domain/']

     rules = (
-        Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
+        Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
     )

     def parse_item(self, response):
-        xs = HtmlXPathSelector(response)
+        hxs = HtmlXPathSelector(response)
         i = ${ProjectName}Item()
-        #i['site_id'] = xs.select('//input[@id="sid"]/@value').extract()
-        #i['name'] = xs.select('//div[@id="name"]').extract()
-        #i['description'] = xs.select('//div[@id="description"]').extract()
+        #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract()
+        #i['name'] = hxs.select('//div[@id="name"]').extract()
+        #i['description'] = hxs.select('//div[@id="description"]').extract()
         return i

 SPIDER = $classname()
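Note: besides the `$site` to `$name`/`$domain` split, the crawl template now passes the rule callback by keyword and a single pattern to `allow`. A concrete sketch of what a generated crawl spider could look like under the new conventions; the domain, URL patterns and XPaths are invented, and the `SgmlLinkExtractor`/`HtmlXPathSelector` import paths are assumptions for this Scrapy version:

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # path assumed
    from scrapy.selector import HtmlXPathSelector                     # path assumed

    class ExampleSpider(CrawlSpider):
        name = 'example.com'
        allowed_domains = ['example.com']
        start_urls = ['http://www.example.com/']

        rules = (
            # follow category listings without parsing them
            Rule(SgmlLinkExtractor(allow=r'/category/')),
            # parse product pages and keep following links from them
            Rule(SgmlLinkExtractor(allow=r'/product/\d+'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            hxs = HtmlXPathSelector(response)
            self.log('product name: %s' % hxs.select('//h1/text()').extract())

    SPIDER = ExampleSpider()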
@@ -2,8 +2,9 @@ from scrapy.contrib.spiders import CSVFeedSpider
 from $project_name.items import ${ProjectName}Item

 class $classname(CSVFeedSpider):
-    domain_name = '$site'
-    start_urls = ['http://www.$site/feed.csv']
+    name = '$name'
+    allowed_domains = ['$domain']
+    start_urls = ['http://www.$domain/feed.csv']
     # headers = ['id', 'name', 'description', 'image_link']
     # delimiter = '\t'

@@ -2,8 +2,9 @@ from scrapy.contrib.spiders import XMLFeedSpider
 from $project_name.items import ${ProjectName}Item

 class $classname(XMLFeedSpider):
-    domain_name = '$site'
-    start_urls = ['http://www.$site/feed.xml']
+    name = '$name'
+    allowed_domains = ['$domain']
+    start_urls = ['http://www.$domain/feed.xml']

     def parse_item(self, response, selector):
         i = ${ProjectName}Item()
@@ -6,9 +6,6 @@ To run all Scrapy unittests go to Scrapy main dir and type:
     bin/runtests.sh

 If you're in windows use runtests.bat instead.

-Keep in mind that some tests may be skipped if you don't have some (optional)
-modules available like MySQLdb or simplejson, but that's not a problem.
-
 """

 import os
@@ -59,10 +59,18 @@ class CommandTest(ProjectTest):

 class GenspiderCommandTest(CommandTest):

+    def test_arguments(self):
+        # only pass one argument. spider script shouldn't be created
+        self.assertEqual(0, self.call('genspider', 'test_name'))
+        assert not exists(join(self.proj_mod_path, 'spiders', 'test_name.py'))
+        # pass two arguments <name> <domain>. spider script should be created
+        self.assertEqual(0, self.call('genspider', 'test_name', 'test.com'))
+        assert exists(join(self.proj_mod_path, 'spiders', 'test_name.py'))
+
     def test_template_default(self, *args):
-        self.assertEqual(0, self.call('genspider', 'testspider', 'test.com', *args))
-        assert exists(join(self.proj_mod_path, 'spiders', 'testspider.py'))
-        self.assertEqual(1, self.call('genspider', 'otherspider', 'test.com'))
+        self.assertEqual(0, self.call('genspider', 'test_spider', 'test.com', *args))
+        assert exists(join(self.proj_mod_path, 'spiders', 'test_spider.py'))
+        self.assertEqual(1, self.call('genspider', 'test_spider', 'test.com'))

     def test_template_basic(self):
         self.test_template_default('--template=basic')
scrapy/tests/test_contrib_exp_crawlspider_matchers.py (new file, 94 lines)
@@ -0,0 +1,94 @@
+from twisted.trial import unittest
+
+from scrapy.http import Request
+from scrapy.http import Response
+
+from scrapy.contrib_exp.crawlspider.matchers import BaseMatcher
+from scrapy.contrib_exp.crawlspider.matchers import UrlMatcher
+from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher
+from scrapy.contrib_exp.crawlspider.matchers import UrlListMatcher
+
+import re
+
+class MatchersTest(unittest.TestCase):
+
+    def setUp(self):
+        pass
+
+    def test_base_matcher(self):
+        matcher = BaseMatcher()
+
+        request = Request('http://example.com')
+        response = Response('http://example.com')
+
+        self.assertTrue(matcher.matches_request(request))
+        self.assertTrue(matcher.matches_response(response))
+
+    def test_url_matcher(self):
+        matcher = UrlMatcher('http://example.com')
+
+        request = Request('http://example.com')
+        response = Response('http://example.com')
+
+        self.failUnless(matcher.matches_request(request))
+        self.failUnless(matcher.matches_request(response))
+
+        request = Request('http://example2.com')
+        response = Response('http://example2.com')
+
+        self.failIf(matcher.matches_request(request))
+        self.failIf(matcher.matches_request(response))
+
+    def test_url_regex_matcher(self):
+        matcher = UrlRegexMatcher(r'sample')
+        urls = (
+            'http://example.com/sample1.html',
+            'http://example.com/sample2.html',
+            'http://example.com/sample3.html',
+            'http://example.com/sample4.html',
+            )
+        for url in urls:
+            request, response = Request(url), Response(url)
+            self.failUnless(matcher.matches_request(request))
+            self.failUnless(matcher.matches_response(response))
+
+        matcher = UrlRegexMatcher(r'sample_fail')
+        for url in urls:
+            request, response = Request(url), Response(url)
+            self.failIf(matcher.matches_request(request))
+            self.failIf(matcher.matches_response(response))
+
+        matcher = UrlRegexMatcher(r'SAMPLE\d+', re.IGNORECASE)
+        for url in urls:
+            request, response = Request(url), Response(url)
+            self.failUnless(matcher.matches_request(request))
+            self.failUnless(matcher.matches_response(response))
+
+    def test_url_list_matcher(self):
+        urls = (
+            'http://example.com/sample1.html',
+            'http://example.com/sample2.html',
+            'http://example.com/sample3.html',
+            'http://example.com/sample4.html',
+            )
+        urls2 = (
+            'http://example.com/sample5.html',
+            'http://example.com/sample6.html',
+            'http://example.com/sample7.html',
+            'http://example.com/sample8.html',
+            'http://example.com/',
+            )
+        matcher = UrlListMatcher(urls)
+
+        # match urls
+        for url in urls:
+            request, response = Request(url), Response(url)
+            self.failUnless(matcher.matches_request(request))
+            self.failUnless(matcher.matches_response(response))
+
+        # non-match urls
+        for url in urls2:
+            request, response = Request(url), Response(url)
+            self.failIf(matcher.matches_request(request))
+            self.failIf(matcher.matches_response(response))
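Note: the new test module above is the only view this commit gives of the experimental CrawlSpider v2 matchers; what it shows is a `matches_request()`/`matches_response()` interface on each matcher. A standalone sketch based solely on that interface (the URL pattern is invented, and no rule wiring is shown because it does not appear in this diff):

    from scrapy.http import Request, Response
    from scrapy.contrib_exp.crawlspider.matchers import UrlRegexMatcher

    matcher = UrlRegexMatcher(r'/product/\d+')

    req = Request('http://example.com/product/42')
    resp = Response('http://example.com/about')

    print matcher.matches_request(req)     # True: the URL matches the pattern
    print matcher.matches_response(resp)   # False: this URL does not match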