Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 04:44:26 +00:00)
finished working version of robots.txt downloader middleware, and renamed the module/class
--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40630
parent 8a404c4480
commit e437ff67ae
@@ -1,30 +1,28 @@
 """
-This is an experimental middleware to respect robots.txt policies. The biggest
-problem it has is that it uses urllib directly (in RobotFileParser.read()
-method) and that conflicts with twisted networking, so it should be ported to
-use twisted networking API, but that is not as trivial as it may seem.
-
-This code is left here for future reference, when we resume the work on this
-subject.
+This is a middleware to respect robots.txt policies. To activate it you must
+enable this middleware and enable the ROBOTSTXT_OBEY setting.
 
 """
 
-import re
 import urlparse
 import robotparser
 
 from pydispatch import dispatcher
 
 from scrapy.core import signals
-from scrapy import log
+from scrapy.core.engine import scrapyengine
+from scrapy.core.exceptions import NotConfigured
+from scrapy.spider import spiders
+from scrapy.http import Request
 from scrapy.core.exceptions import IgnoreRequest
 from scrapy.conf import settings
 
-BASEURL_RE = re.compile("http://.*?/")
+class RobotsTxtMiddleware(object):
 
-class RobotsMiddleware(object):
-
     def __init__(self):
+        if not settings.getbool('ROBOTSTXT_OBEY'):
+            raise NotConfigured
+
         self._parsers = {}
         self._spiderdomains = {}
         self._pending = {}
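The rewritten docstring and the NotConfigured check above mean the middleware does nothing unless ROBOTSTXT_OBEY is turned on in the project settings. Below is a minimal sketch of what activation could look like in a settings module; only the ROBOTSTXT_OBEY name comes from this diff, while the DOWNLOADER_MIDDLEWARES list and the scrapy.contrib.downloadermiddleware.robotstxt module path are assumptions about how a downloader middleware of this era would be registered.

# settings.py -- sketch only, not part of the commit.
# ROBOTSTXT_OBEY is the flag read by RobotsTxtMiddleware.__init__ above;
# the DOWNLOADER_MIDDLEWARES name and the module path are assumed here.
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = [
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware',
]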
@@ -38,18 +36,22 @@ class RobotsMiddleware(object):
         raise IgnoreRequest("URL forbidden by robots.txt: %s" % request.url)
 
     def robot_parser(self, url, spiderdomain):
-        urldomain = urlparse.urlparse(url).hostname
+        parsedurl = urlparse.urlparse(url)
+        urldomain = parsedurl.hostname
         if urldomain in self._parsers:
-            rp = self._parsers[urldomain]
+            return self._parsers[urldomain]
         else:
-            rp = robotparser.RobotFileParser()
-            m = BASEURL_RE.search(url)
-            if m:
-                rp.set_url("%srobots.txt" % m.group())
-                rp.read()
-            self._parsers[urldomain] = rp
+            self._parsers[urldomain] = None
+            robotsurl = "%s://%s/robots.txt" % parsedurl[0:2]
+            robotsreq = Request(robotsurl)
+            dfd = scrapyengine.schedule(robotsreq, spiders.fromdomain(spiderdomain), priority=0)
+            dfd.addCallbacks(callback=self._parse_robots, callbackArgs=[urldomain])
         self._spiderdomains[spiderdomain].add(urldomain)
-        return rp
+
+    def _parse_robots(self, response, urldomain):
+        rp = robotparser.RobotFileParser()
+        rp.parse(response.to_string().splitlines())
+        self._parsers[urldomain] = rp
 
     def domain_open(self, domain):
         self._spiderdomains[domain] = set()
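The second hunk is the substance of the change: instead of letting RobotFileParser.read() fetch robots.txt with blocking urllib, the middleware now schedules a normal Request through scrapyengine and feeds the downloaded body to robotparser in the _parse_robots callback, storing None as a placeholder so requests arriving before the download finishes don't trigger duplicate fetches. The following is a small standalone sketch of the parse-and-check step, written against the same Python 2 stdlib robotparser module the file imports; the robots.txt body, user agent string and URLs are made up for illustration, and the can_fetch() call stands in for the kind of check that produces the IgnoreRequest above.

# Sketch: parsing an already-downloaded robots.txt body, as _parse_robots()
# does with response.to_string(), then asking whether URLs may be fetched.
# Illustrative values only; Python 2 syntax to match the code in the diff.
import robotparser

robots_body = (
    "User-agent: *\n"
    "Disallow: /private/\n"
)

rp = robotparser.RobotFileParser()
rp.parse(robots_body.splitlines())      # same call as _parse_robots() above

print rp.can_fetch("ScrapyBot", "http://example.com/private/page")   # False
print rp.can_fetch("ScrapyBot", "http://example.com/index.html")     # True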