Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-25 09:24:20 +00:00
removed rulengine and simpage code
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40314
This commit is contained in:
parent e9f3913328
commit e6f73c3dfa
@@ -1,91 +0,0 @@
"""
SimpagesMiddleware is a middleware for detecting similar page layouts
"""

import datetime
import pprint
import pickle

from pydispatch import dispatcher

from scrapy.core import signals
from scrapy.http import Response
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings

from scrapy.contrib.rulengine.responseWrapper import ResponseWrapper
from scrapy.contrib.rulengine.pipeline import RulesPipeline

from .metrics import tagdepth, histogram


class SimpagesMiddleware(object):

    metric = tagdepth
    threshold = 0.8

    def __init__(self):
        if not settings.getbool('SIMPAGES_ENABLED'):
            raise NotConfigured
        repfilename = settings.get('SIMPAGES_REPORT_FILE')
        self.reportfile = open(repfilename, "a") if repfilename else None
        # the pickle file must be opened in binary mode
        persistence_file = open(repfilename + '.pickle', 'wb') if repfilename else None
        self.persistent_simgroup = pickle.Pickler(persistence_file) if persistence_file else None
        self.sim_groups = {}
        self.last_group = 0
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
        # Rules
        RulesPipeline.loadRules()

    def process_response(self, request, response, spider):
        if isinstance(response, Response):
            group, simrate, simhash = self.get_similarity_group(response)
            if group:
                wres = ResponseWrapper(response)
                rp = RulesPipeline(wres)
                rules_info = rp.execute()
                self.sim_groups[group]['similar_urls'].append((response.url, simrate, simhash, rules_info))
            else:
                self.create_similarity_group(response)
        return response

    def get_similarity_group(self, response):
        sh = self.metric.simhash(response, symnumbers=True)
        for group, data in self.sim_groups.iteritems():
            simrate = self.metric.compare(sh, data['simhash'])
            if simrate > self.threshold:
                return (group, simrate, data['simhash'])
        return (None, 0, set())

    def create_similarity_group(self, response):
        self.last_group += 1
        data = {}
        data['simhash'] = self.metric.simhash(response, symnumbers=True)
        data['first_url'] = response.url
        wres = ResponseWrapper(response)
        rp = RulesPipeline(wres)
        data['rules_info'] = rp.execute()
        data['similar_urls'] = []
        self.sim_groups[self.last_group] = data

    def get_report(self):
        data_hist = dict([(k, len(v['similar_urls'])) for k, v in self.sim_groups.items()])
        r = "Page similarity results\n"
        r += "=======================\n\n"
        r += "Datetime : %s\n" % datetime.datetime.now()
        r += "Metric : %s\n" % self.metric.__name__
        r += "Threshold: %s\n" % self.threshold
        r += "Distribution per group:\n%s\n" % histogram.plot(data_hist)
        r += "Results :\n"
        r += pprint.pformat(self.sim_groups)
        r += "\n\n"
        return r

    def engine_stopped(self):
        rep = self.get_report()
        if self.reportfile:
            self.reportfile.write(rep)
        else:
            print rep
        if self.persistent_simgroup:
            self.persistent_simgroup.dump(self.sim_groups)
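For context, a minimal sketch of the settings the removed code consulted. The setting names are taken from the code above and from the rules loader further below; the values and the rule path are illustrative only.

    # Hypothetical settings sketch -- names from the removed code, values invented.
    SIMPAGES_ENABLED = True
    SIMPAGES_REPORT_FILE = 'simpages_report.txt'   # report is appended here; a .pickle dump is written alongside
    SIMPAGES_RULES = ['myproject.rules.SomeRule']  # paths loaded via load_class by RulesManager.load()
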
@@ -1,19 +0,0 @@
"""
This module contains several metrics that can be used with the
SimpagesMiddleware.

A metric must implement two functions:

1. simhash(response, *args)

   Receives a response and returns a simhash of that response. A simhash can
   be an object of any type; its only purpose is to provide a fast way of
   comparing the [simhashed] response with other responses (which will also
   be simhashed).

2. compare(simhash1, simhash2)

   Receives two simhashes and must return a float between 0 and 1, depending
   on how similar the two simhashes (and, thus, the responses they represent)
   are: 0 means completely different, 1 means identical.
"""
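A minimal metric conforming to this interface might look like the following sketch (hypothetical, not part of the removed code): it hashes a response by the set of lowercased words in its body and compares two hashes with the same Jaccard similarity the tagdepth metric below uses.

    # Hypothetical example metric -- illustrates the simhash/compare contract.

    def simhash(response, *args):
        # Represent the page by the set of lowercased words in its body.
        return set(response.body.lower().split())

    def compare(simhash1, simhash2):
        # Jaccard similarity: 1.0 for identical sets, 0.0 for disjoint ones.
        if not simhash1 and not simhash2:
            return 1.0
        return len(simhash1 & simhash2) / float(len(simhash1 | simhash2))
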
@@ -1,39 +0,0 @@
import math


def plot(data):
    """
    data is a dict of the form {key: quantity, ...}.
    Build a text histogram of the data dict and return it as a string.
    """
    maxv = max(data.values())
    minv = min(data.values())
    step = (maxv - minv) * 0.1 if (maxv - minv) != 0 else 1
    s = []
    for key, q in data.items():
        s1 = "%s%s: " % (key, blanks(6 - len(str(key))))
        # one '=' per step of the value
        for i in xrange(1, int(math.ceil(q / step) + 1)):
            s1 += "="
        if s1[-1] == '=':
            s1 += " "
        s1 += str(q)
        s.append(s1)
    maxl = len(max(s, key=lambda x: len(x)))
    s2 = '-' * maxl

    r = "\tgroup | quantities\n"
    r += "\t%s" % s2
    for x in s:
        r += "\n\t%s" % x
    return r


def blanks(n):
    return ' ' * n


def print_plot(data):
    print plot(data)
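As a usage sketch, print_plot({1: 1, 2: 2}) would emit something like the following (one '=' per tenth of the value range; spacing approximate):

    group | quantities
    ------------------------------
    1     : ========== 1
    2     : ==================== 2
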
@@ -1,39 +0,0 @@
"""
tagdepth metric

Compares pages by analyzing a predefined set of (relevant) tags and the
depth at which they appear in the page markup.

Requires the ResponseSoup extension to be enabled.
"""

from __future__ import division

from BeautifulSoup import Tag

relevant_tags = set(['div', 'table', 'td', 'tr', 'h1', 'p'])


def get_symbol_dict(node, tags=(), depth=1):
    # Map "<depth><tagname>" symbols to the number of times each occurs.
    symdict = {}
    for tag in node:
        if isinstance(tag, Tag) and tag.name in tags:
            symbol = "%d%s" % (depth, str(tag.name))
            symdict[symbol] = symdict.setdefault(symbol, 0) + 1
            # merge the counts from the subtree instead of overwriting them
            for k, v in get_symbol_dict(tag, tags, depth + 1).items():
                symdict[k] = symdict.get(k, 0) + v
    return symdict


def simhash(response, symnumbers=False):
    soup = response.soup
    symdict = get_symbol_dict(soup.find('body'), relevant_tags)
    if symnumbers:
        s = set([k + str(v) for k, v in symdict.items()])
    else:
        s = set(symdict.keys())
    return s


def compare(sh1, sh2):
    if sh1 == sh2:
        return 1.0
    else:
        # Jaccard index of the two symbol sets
        return len(sh1 & sh2) / len(sh1 | sh2)
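A worked example of compare: two tag-depth simhashes that share two of four distinct symbols yield a similarity of 0.5.

    # compare(set(['1div', '2table', '2td']),
    #         set(['1div', '2table', '2tr']))
    # intersection = 2 symbols, union = 4 symbols  ->  2/4 = 0.5
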
@@ -1,74 +0,0 @@
"""
Define here the rules that the rules engine/pipeline will process.
"""
from scrapy.utils.misc import load_class
from scrapy.conf import settings


class Rule(object):
    """
    Interface of the rules.
    Subclass this class to create a new rule.
    """
    def __init__(self, wresponse=None):
        self.__responsewrapper = wresponse

    def __getresponsewrapper(self):
        return self.__responsewrapper

    def __setresponsewrapper(self, wresponse):
        self.__responsewrapper = wresponse

    responsewrapper = property(__getresponsewrapper, __setresponsewrapper)

    def check(self):
        result = 0.0
        if self.responsewrapper:
            result = self.holds()
        if result < 0 or result > 1:
            raise ValueError("Value must be between 0 and 1.")
        return result

    def holds(self):
        """
        Subclasses must override this method with the conditions that the
        rule checks. The return value must be a number between 0.0 and 1.0.
        """
        return 0.0
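
# A hypothetical example of the interface (not part of the original rule
# set): a rule that holds fully when the wrapped response has any body
# text, using the bodytext property of ResponseWrapper shown further below.
class HasBodyTextRule(Rule):
    def holds(self):
        return 1.0 if self.responsewrapper.bodytext else 0.0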


class RulesManager(object):
    """
    The RulesManager takes care of loading and keeping track of all enabled
    rules. The module also provides an instantiated RulesManager (rules) to
    be used as a singleton.
    The RulesManager holds the rule classes, not instances of them; this
    approach gives the rules engine more flexibility.
    """

    def __init__(self):
        self.loaded = False
        self.enabled = {}

    def load(self):
        """
        Load the rules enabled through the SIMPAGES_RULES setting.
        """
        self.loaded = False
        self.enabled.clear()

        for extension_path in settings.getlist('SIMPAGES_RULES'):
            cls = load_class(extension_path)
            self.enabled[cls.__name__] = cls

        self.loaded = True

    def reload(self):
        self.load()


rules = RulesManager()
@@ -1,9 +0,0 @@
"""
Contains the exceptions used by the simpages code.
"""


class RulesNotLoaded(Exception):
    """
    Indicates that the rules were not loaded.
    """
    pass
@@ -1,52 +0,0 @@
"""
Represents the pipeline of execution for the rules engine.
"""
from scrapy import log
from scrapy.contrib.rulengine.exceptions import RulesNotLoaded

from scrapy.contrib.rulengine import rules


class RulesPipeline(object):

    rulesLoaded = []
    loaded = False

    def __init__(self, wresponse):
        """
        wresponse: a response wrapper object that contains the response.
        """
        self.__rules = []
        self._responsewrapper = wresponse

    @staticmethod
    def loadRules():
        RulesPipeline.loaded = True
        rules.load()
        try:
            for rulename in rules.enabled.keys():
                ldr_msg = 'Loading ... %s' % rulename
                ruleClass = rules.enabled[rulename]
                RulesPipeline.rulesLoaded.append(ruleClass())
                log.msg(ldr_msg)
                print ldr_msg
        except Exception, e:
            RulesPipeline.loaded = False
            RulesPipeline.rulesLoaded = []
            log.msg(e)
            print e

    def execute(self):
        """
        Returns a dictionary that contains the results of all the executed
        rules, keyed by rule class name under 'rules_executed'.
        """
        if RulesPipeline.loaded:
            rules_loaded = RulesPipeline.rulesLoaded
            info_dict = {}
            info_dict['rules_executed'] = {}
            for rule in rules_loaded:
                rule.responsewrapper = self._responsewrapper
                rule_result = rule.check()
                info_dict['rules_executed'][rule.__class__.__name__] = rule_result
            return info_dict
        else:
            raise RulesNotLoaded('Problems loading the rules.')
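For reference, a sketch of the dictionary execute() returns; rule names and values are illustrative only.

    # info = rp.execute()
    # {'rules_executed': {'SomeRule': 0.7, 'AnotherRule': 1.0}}
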
@@ -1,59 +0,0 @@
"""
This module provides a wrapper for the response object.
The objects created by ResponseWrapper are not responses, but objects that
contain a response plus a lot of useful information about it.
"""

import re

from BeautifulSoup import Tag


def extractText(soup):
    # Collect all text nodes except those inside <script> tags.
    text_in_tags = soup.findAll(text=True)
    res = []
    for tag in text_in_tags:
        if isinstance(tag.parent, Tag) and tag.parent.name not in ('script',):
            res.append(tag)
    return res


class ResponseWrapper(object):
    def __init__(self, response):
        self._response = response
        self.__soup_bodytext = extractText(response.soup.body) if response else None
        self.__soup_headtext = extractText(response.soup.head) if response else None
        self._soup = None
        self._bodytext = None
        self._headtext = None
        self._cleanbodytext = None

    def __cleantext(self, souptext):
        return filter(lambda x: re.search(r'\w+', x), souptext)

    @property
    def soup(self):
        # cache the soup the first time it is accessed
        if not self._soup:
            self._soup = self._response.soup
        return self._soup

    @property
    def bodytext(self):
        if not self._bodytext:
            self._bodytext = self.__cleantext(self.__soup_bodytext)
        return self._bodytext

    @property
    def headtext(self):
        if not self._headtext:
            self._headtext = self.__cleantext(self.__soup_headtext)
        return self._headtext

    @property
    def cleanbodytext(self):
        if not self._cleanbodytext:
            text = ' '.join(self.bodytext)
            text = text.lower()
            text = text.replace('\n', '')
            text = text.replace('\t', '')
            # strip HTML entities such as &nbsp;
            text = re.sub('&.*?;', '', text)
            self._cleanbodytext = text
        return self._cleanbodytext
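A usage sketch tying the pieces together, mirroring how SimpagesMiddleware drove them (all calls appear in the removed code above; it assumes RulesPipeline.loadRules() has already run and that response carries the ResponseSoup extension's .soup):

    wres = ResponseWrapper(response)
    rp = RulesPipeline(wres)
    rules_info = rp.execute()      # {'rules_executed': {...}}
    print wres.cleanbodytext       # lowercased body text with entities stripped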