mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 09:24:20 +00:00

removed rulengine and simpage code

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40314
This commit is contained in:
olveyra 2008-10-09 18:59:32 +00:00
parent e9f3913328
commit e6f73c3dfa
8 changed files with 0 additions and 382 deletions

View File

@@ -1,91 +0,0 @@
"""
SimpageMiddleware is a middleware for detecting similar page layouts
"""
import sys
import datetime
import pprint
import pickle
from pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Response
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.contrib.rulengine.responseWrapper import ResponseWrapper
from scrapy.contrib.rulengine.pipeline import RulesPipeline
from .metrics import tagdepth, histogram
class SimpagesMiddleware(object):
metric = tagdepth
threshold = 0.8
def __init__(self):
if not settings.getbool('SIMPAGES_ENABLED'):
raise NotConfigured
repfilename = settings.get('SIMPAGES_REPORT_FILE')
self.reportfile = open(repfilename, "a") if repfilename else None
persistence_filename = open(repfilename + '.pickle', 'w') if repfilename else None
self.persistent_simgroup = pickle.Pickler(persistence_filename) if persistence_filename else None
self.sim_groups = {}
self.last_group = 0
dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
#Rules
RulesPipeline.loadRules()
def process_response(self, request, response, spider):
if isinstance(response, Response):
group, simrate, simhash = self.get_similarity_group(response)
if group:
wres = ResponseWrapper(response)
rp = RulesPipeline(wres)
rules_info = rp.execute()
self.sim_groups[group]['similar_urls'].append((response.url, simrate, simhash, rules_info))
else:
self.create_similarity_group(response)
return response
def get_similarity_group(self, response):
sh = self.metric.simhash(response, symnumbers=True)
for group, data in self.sim_groups.iteritems():
simrate = self.metric.compare(sh, data['simhash'])
if simrate > self.threshold:
return (group, simrate, data['simhash'])
return (None, 0, set())
def create_similarity_group(self, response):
self.last_group += 1
data = {}
data['simhash'] = self.metric.simhash(response, symnumbers=True)
data['first_url'] = response.url
wres = ResponseWrapper(response)
rp = RulesPipeline(wres)
data['rules_info'] = rp.execute()
data['similar_urls'] = []
self.sim_groups[self.last_group] = data
def get_report(self):
data_hist = dict( [(k, len(v['similar_urls'])) for k, v in self.sim_groups.items()] )
r = "Page similarity results\n"
r += "=======================\n\n"
r += "Datetime : %s\n" % datetime.datetime.now()
r += "Metric : %s\n" % self.metric.__name__
r += "Threshold: %s\n" % self.threshold
r += "Distribution per group:\n%s\n" % histogram.plot(data_hist)
r += "Results :\n"
r += pprint.pformat(self.sim_groups)
r += "\n\n"
return r
def engine_stopped(self):
rep = self.get_report()
if self.reportfile:
self.reportfile.write(rep)
else:
print rep
if self.persistent_simgroup:
self.persistent_simgroup.dump(self.sim_groups)
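
For reference, the settings this middleware reads are SIMPAGES_ENABLED, SIMPAGES_REPORT_FILE and, through the rules manager, SIMPAGES_RULES. A minimal sketch of a settings module that would have activated it; only the setting names come from the code above, the values and the rule path are made up:

# settings.py sketch (hypothetical values)
SIMPAGES_ENABLED = True
SIMPAGES_REPORT_FILE = '/tmp/simpages_report.txt'   # report is appended here; groups are also pickled to the same path plus '.pickle'
SIMPAGES_RULES = ['myproject.rules.HasTitleRule']    # hypothetical rule, see the rules module below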

View File

@@ -1,19 +0,0 @@
"""
This module contains several metrics that can be used with the
SimpagesMiddleware.
A metric must implement two functions:
1. simhash(response, *args)
Receives a response and returns a simhash of that response. A simhash can be
an object of any type and its only purpose is to provide a fast way for
comparing the [simhashed] response with another responses (that will also be
simhashed).
2. compare(simhash1, simhash2)
Receives two simhashes and must return a (float) value between 0 and 1,
depending on how similar the two simhashes (and, thus, the responses they
represent) are. 0 means completely different, 1 means identical.
"""

View File

@@ -1,39 +0,0 @@
import math


def plot(data):
    """
    data is a dict of the form {key: quantity, ...}.
    Build a text histogram of the data dict.
    """
    maxv = max(data.values())
    minv = min(data.values())
    step = (maxv - minv) * 0.1 if (maxv - minv) != 0 else 1
    s = []
    for key, q in data.items():
        s1 = "%s%s: " % (key, blanks(6 - len(str(key))))
        for i in xrange(1, int(math.ceil(q/step)+1)):
            s1 += "="
        if s1[len(s1)-1] == '=':
            s1 += " "
        s1 += str(q)
        s.append(s1)
    maxl = len(max(s, key=lambda x: len(x)))
    s2 = ''
    for i in xrange(1, maxl+1):
        s2 += '-'
    r = "\tgroup | quantities\n"
    r += "\t%s" % s2
    for x in s:
        r += "\n\t%s" % x
    return r


def blanks(n):
    return ''.join([' ' for x in range(1, n+1)])


def print_plot(data):
    print plot(data)
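
A short usage sketch with made-up group sizes; each bar is roughly proportional to its count (ceil(quantity / step) '=' characters, where step is a tenth of the value range):

group_sizes = {1: 12, 2: 3, 3: 7}   # e.g. number of similar pages found per group
report_chart = plot(group_sizes)    # returns the chart as a string, as used in get_report()
print_plot(group_sizes)             # or print it directly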

View File

@@ -1,39 +0,0 @@
"""
tagdepth metric
Compares pages analyzing a predefined set of (relevant)
tags and the depth where they appear in the page markup document.
Requires ResponseSoup extension enabled.
"""
from __future__ import division
from BeautifulSoup import Tag
relevant_tags = set(['div', 'table', 'td', 'tr', 'h1','p'])
def get_symbol_dict(node, tags=(), depth=1):
symdict = {}
for tag in node:
if isinstance(tag, Tag) and tag.name in tags:
symbol = "%d%s" % (depth, str(tag.name))
symdict[symbol] = symdict.setdefault(symbol, 0) + 1
symdict.update(get_symbol_dict(tag, tags, depth+1))
return symdict
def simhash(response, symnumbers=False):
soup = response.soup
symdict = get_symbol_dict(soup.find('body'), relevant_tags)
if symnumbers:
s = set([k+str(v) for k,v in symdict.items()])
else:
s = set(symdict.keys())
return s
def compare(sh1, sh2):
if sh1 == sh2:
return 1.0
else:
return len(sh1 & sh2) / len(sh1 | sh2)
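
To make the symbol scheme concrete, here is a small walk-through using the functions above (BeautifulSoup 3 style, matching the import; the HTML snippet is invented):

from BeautifulSoup import BeautifulSoup

html = "<html><body><div><table><tr><td>x</td></tr></table></div><p>hi</p></body></html>"
soup = BeautifulSoup(html)

# Symbols are "<depth><tagname>", counting depth from the children of <body>:
get_symbol_dict(soup.find('body'), relevant_tags)
# -> {'1div': 1, '2table': 1, '3tr': 1, '4td': 1, '1p': 1}

# With symnumbers=True the count is appended to each symbol ('1div1', ...),
# and compare() is the Jaccard similarity of two such sets:
compare(set(['1div1', '2table1', '1p1']), set(['1div1', '2table1', '1p2']))
# -> 0.5  (2 shared symbols out of 4 distinct ones)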

View File

@@ -1,74 +0,0 @@
"""
Here put the rules that the rules engine / pipline will take to process it.
"""
from scrapy.core.exceptions import NotConfigured
from scrapy.utils.misc import load_class
from scrapy import log
from scrapy.conf import settings
class Rule(object):
"""
Interface of the Rules.
Implement this class to create a new rule.
"""
def __init__(self, wresponse=None):
self.__responsewrapper = wresponse
def __getresponsewrapper(self):
return self.__responsewrapper
def __setresponsewrapper(self, wresponse):
self.__responsewrapper = wresponse
responsewrapper = property(__getresponsewrapper, __setresponsewrapper)
def check(self):
result = 0.0
if self.responsewrapper:
result = self.holds()
if result < 0 or result > 1:
raise ValueError, "Value must be between 0 and 1."
return result
def holds(self):
"""
User of this class must override this method.
Put here the conditions that must be satisfied by the rule.
The return value must be a number between 0.0 and 1.0.
"""
return 0.0
class RulesManager(object):
"""
This class contains the RulesManager which takes care of loading and
keeping track of all enabled rules. It also contains an instantiated
RulesManager (rules) to be used as singleton.
The RulesManager contains the rules classes, not instances of the rules
classes, this approach give us more flexiblility in our Rules Engine.
"""
def __init__(self):
self.loaded = False
self.enabled = {}
def load(self):
"""
Load enabled extensions in settings module
"""
self.loaded = False
self.enabled.clear()
for extension_path in settings.getlist('SIMPAGES_RULES'):
cls = load_class(extension_path)
self.enabled[cls.__name__] = cls
self.loaded = True
def reload(self):
self.load()
rules = RulesManager()
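
As a rough sketch of how a concrete rule would look; the class, its module path and the import path of Rule are assumptions based on the package layout above:

from scrapy.contrib.rulengine import Rule   # import path assumed from the package layout

class HasTitleRule(Rule):
    """Hypothetical rule: scores 1.0 when the wrapped response has head text."""
    def holds(self):
        return 1.0 if self.responsewrapper.headtext else 0.0

# Enabled via settings so RulesManager.load() can pick it up:
# SIMPAGES_RULES = ['myproject.rules.HasTitleRule']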

View File

@@ -1,9 +0,0 @@
"""
Contains exceptions of the simpages stuffs
"""
class RulesNotLoaded(Exception):
"""
Indicate that the rules was not loaded.
"""
pass

View File

@@ -1,52 +0,0 @@
"""
Represent the pipepline of execution to the rules engine.
"""
from scrapy import log
from scrapy.contrib.rulengine.exceptions import RulesNotLoaded
from scrapy.contrib.rulengine import rules
class RulesPipeline(object):
rulesLoaded = []
loaded = False
def __init__(self, wresponse):
"""
wresponse: is a response wrapper object that contain the response.
"""
self.__rules = []
self._responsewrapper = wresponse
@staticmethod
def loadRules():
RulesPipeline.loaded = True
rules.load()
try:
for rulename in rules.enabled.keys():
ldr_msg = 'Loading ... %s' % rulename
ruleClass = rules.enabled[rulename]
RulesPipeline.rulesLoaded.append(ruleClass())
log.msg(ldr_msg)
print ldr_msg
except Exception, e:
RulesPipeline.loaded = False
RulesPipeline.rulesLoaded = []
log.msg(e)
print e
def execute(self):
"""
Return a dictionary that conatins all the rules executed.
"""
if RulesPipeline.loaded:
rules_loaded = RulesPipeline.rulesLoaded
info_dict = {}
info_dict['rules_executed'] = {}
for rule in rules_loaded:
rule.responsewrapper = self._responsewrapper
rule_result = rule.check()
info_dict['rules_executed'][rule.__class__.__name__] = rule_result
return info_dict
else:
raise RulesNotLoaded, 'Problems loading the rules.'
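
Putting it together, the middleware drives the pipeline roughly like this; the response is assumed to come from a crawl, ResponseWrapper is defined in responseWrapper.py below, and HasTitleRule is the hypothetical rule sketched earlier:

RulesPipeline.loadRules()                 # instantiate every rule listed in SIMPAGES_RULES (done once)

wres = ResponseWrapper(response)          # wrap the crawled response for the rules to inspect
info = RulesPipeline(wres).execute()
# info == {'rules_executed': {'HasTitleRule': 1.0, ...}}, one score in [0, 1] per loaded rule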

View File

@@ -1,59 +0,0 @@
"""
This class serve as wrapper of the response object.
The object created by this class are not response, but are objects that contains a response a a lot of useful information about the response.
"""
import re
from BeautifulSoup import Tag
def extractText(soup):
text_in_tags = soup.findAll(text=True)
res = []
for tag in text_in_tags:
if isinstance(tag.parent, Tag) and tag.parent.name not in('script'):
res.append(tag)
return res
class ResponseWrapper(object):
def __init__(self, response):
self._response = response
self.__soup_bodytext = extractText(response.soup.body) if response else None
self.__soup_headtext = extractText(response.soup.head) if response else None
self._soup = None
self._bodytext = None
self._headtext = None
self._cleanbodytext = None
def __cleantext(self, souptext):
return filter(lambda x:re.search('\w+', x), souptext)
@property
def soup(self):
if (self._soup):
self._soup = self._response.soup
return self._response.soup
@property
def bodytext(self):
if not self._bodytext:
self._bodytext = self.__cleantext(self.__soup_bodytext)
return self._bodytext
@property
def headtext(self):
if not self._headtext:
self._headtext = self.__cleantext(self.__soup_headtext)
return self._headtext
@property
def cleanbodytext(self):
if not self._cleanbodytext:
text = ' '.join(self.bodytext)
text = text.lower()
text = text.replace('\n', '')
text = text.replace('\t', '')
text = re.sub('&.*?;', '', text)
self._cleanbodytext = text
return self._cleanbodytext
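
For illustration, roughly what the wrapper extracts from a small page; the FakeResponse stand-in and the HTML are invented, and response.soup is normally provided by the ResponseSoup extension mentioned in the tagdepth metric:

from BeautifulSoup import BeautifulSoup

class FakeResponse(object):
    # Hypothetical stand-in exposing only the .soup attribute the wrapper uses.
    def __init__(self, html):
        self.soup = BeautifulSoup(html)

html = ("<html><head><title>News</title><script>var x = 1;</script></head>"
        "<body><h1>Hello &amp; welcome</h1><p>First story</p></body></html>")
wres = ResponseWrapper(FakeResponse(html))

print wres.headtext        # [u'News']  (the <script> text is filtered out)
print wres.bodytext        # [u'Hello &amp; welcome', u'First story']
print wres.cleanbodytext   # 'hello  welcome first story'  (lowercased, entity stripped)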