
finished working version of robots.txt downloader middleware, and renamed the module/class

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40630
Pablo Hoffman 2009-01-03 07:35:30 +00:00
parent 8a404c4480
commit e437ff67ae


@@ -1,30 +1,28 @@
 """
-jhis is an expertimental middleware to respect robots.txt policies. The biggest
-problem it has is that it uses urllib directly (in RobotFileParser.read()
-method) and that conflicts with twisted networking, so it should be ported to
-use twisted networking API, but that is not as trivial as it may seem.
-
-This code is left here for future reference, when we resume the work on this
-subject.
+This is a middleware to respect robots.txt policies. To active it you must
+enable this middleware and enable the ROBOTSTXT_OBEY setting.
+
 """
 
-import re
 import urlparse
 import robotparser
 
 from pydispatch import dispatcher
 
 from scrapy.core import signals
-from scrapy import log
+from scrapy.core.engine import scrapyengine
+from scrapy.core.exceptions import NotConfigured
+from scrapy.spider import spiders
+from scrapy.http import Request
 from scrapy.core.exceptions import IgnoreRequest
 from scrapy.conf import settings
 
-BASEURL_RE = re.compile("http://.*?/")
-
-class RobotsMiddleware(object):
+class RobotsTxtMiddleware(object):
 
     def __init__(self):
+        if not settings.getbool('ROBOTSTXT_OBEY'):
+            raise NotConfigured
+
         self._parsers = {}
         self._spiderdomains = {}
         self._pending = {}
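
The new docstring says the middleware only works when it is enabled together with the ROBOTSTXT_OBEY setting. As a rough sketch of what that means in a project settings module: the DOWNLOADER_MIDDLEWARES entry and the module path below are assumptions based on Scrapy conventions of this era, not taken from this commit; only ROBOTSTXT_OBEY appears in the diff.

# settings.py -- hypothetical sketch, not part of this commit
ROBOTSTXT_OBEY = True

# assumed location of the renamed module/class; adjust to your Scrapy version
DOWNLOADER_MIDDLEWARES = [
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware',
]
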
@@ -38,18 +36,22 @@ class RobotsMiddleware(object):
             raise IgnoreRequest("URL forbidden by robots.txt: %s" % request.url)
 
     def robot_parser(self, url, spiderdomain):
-        urldomain = urlparse.urlparse(url).hostname
+        parsedurl = urlparse.urlparse(url)
+        urldomain = parsedurl.hostname
         if urldomain in self._parsers:
-            rp = self._parsers[urldomain]
+            return self._parsers[urldomain]
         else:
-            rp = robotparser.RobotFileParser()
-            m = BASEURL_RE.search(url)
-            if m:
-                rp.set_url("%srobots.txt" % m.group())
-                rp.read()
-            self._parsers[urldomain] = rp
+            self._parsers[urldomain] = None
+            robotsurl = "%s://%s/robots.txt" % parsedurl[0:2]
+            robotsreq = Request(robotsurl)
+            dfd = scrapyengine.schedule(robotsreq, spiders.fromdomain(spiderdomain), priority=0)
+            dfd.addCallbacks(callback=self._parse_robots, callbackArgs=[urldomain])
         self._spiderdomains[spiderdomain].add(urldomain)
-        return rp
+
+    def _parse_robots(self, response, urldomain):
+        rp = robotparser.RobotFileParser()
+        rp.parse(response.to_string().splitlines())
+        self._parsers[urldomain] = rp
 
     def domain_open(self, domain):
         self._spiderdomains[domain] = set()
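
Instead of the old blocking RobotFileParser.read() call, the rewritten robot_parser() schedules the robots.txt fetch through the Scrapy engine and lets the deferred callback hand the response body to the standard-library parser; the resulting parser is then presumably consulted via can_fetch() elsewhere in the class before IgnoreRequest is raised. A minimal standalone sketch of that robotparser flow (Python 2 naming, as in the diff; the robots.txt body and user agent below are invented for illustration):

# sketch of the stdlib robotparser usage the callback relies on
import robotparser

robots_body = "User-agent: *\nDisallow: /private/\n"

rp = robotparser.RobotFileParser()
rp.parse(robots_body.splitlines())   # same call as in _parse_robots()

print rp.can_fetch("mybot", "http://example.com/private/page")  # False
print rp.can_fetch("mybot", "http://example.com/index.html")    # True
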