Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-25 09:24:20 +00:00
removed rulengine and simpage code
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40314
This commit is contained in:
parent e9f3913328
commit e6f73c3dfa
@@ -1,91 +0,0 @@
"""
SimpagesMiddleware is a middleware for detecting similar page layouts
"""

import datetime
import pprint
import pickle

from pydispatch import dispatcher

from scrapy.core import signals
from scrapy.http import Response
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings

from scrapy.contrib.rulengine.responseWrapper import ResponseWrapper
from scrapy.contrib.rulengine.pipeline import RulesPipeline

from .metrics import tagdepth, histogram


class SimpagesMiddleware(object):

    metric = tagdepth
    threshold = 0.8

    def __init__(self):
        if not settings.getbool('SIMPAGES_ENABLED'):
            raise NotConfigured
        repfilename = settings.get('SIMPAGES_REPORT_FILE')
        self.reportfile = open(repfilename, "a") if repfilename else None
        # the pickle file must be opened in binary mode
        persistence_file = open(repfilename + '.pickle', 'wb') if repfilename else None
        self.persistent_simgroup = pickle.Pickler(persistence_file) if persistence_file else None
        self.sim_groups = {}
        self.last_group = 0
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)
        # Rules
        RulesPipeline.loadRules()

    def process_response(self, request, response, spider):
        if isinstance(response, Response):
            group, simrate, simhash = self.get_similarity_group(response)
            if group:
                wres = ResponseWrapper(response)
                rp = RulesPipeline(wres)
                rules_info = rp.execute()
                self.sim_groups[group]['similar_urls'].append((response.url, simrate, simhash, rules_info))
            else:
                self.create_similarity_group(response)
        return response

    def get_similarity_group(self, response):
        sh = self.metric.simhash(response, symnumbers=True)
        for group, data in self.sim_groups.iteritems():
            simrate = self.metric.compare(sh, data['simhash'])
            if simrate > self.threshold:
                return (group, simrate, data['simhash'])
        return (None, 0, set())

    def create_similarity_group(self, response):
        self.last_group += 1
        data = {}
        data['simhash'] = self.metric.simhash(response, symnumbers=True)
        data['first_url'] = response.url
        wres = ResponseWrapper(response)
        rp = RulesPipeline(wres)
        data['rules_info'] = rp.execute()
        data['similar_urls'] = []
        self.sim_groups[self.last_group] = data

    def get_report(self):
        data_hist = dict([(k, len(v['similar_urls'])) for k, v in self.sim_groups.items()])
        r = "Page similarity results\n"
        r += "=======================\n\n"
        r += "Datetime : %s\n" % datetime.datetime.now()
        r += "Metric : %s\n" % self.metric.__name__
        r += "Threshold: %s\n" % self.threshold
        r += "Distribution per group:\n%s\n" % histogram.plot(data_hist)
        r += "Results :\n"
        r += pprint.pformat(self.sim_groups)
        r += "\n\n"
        return r

    def engine_stopped(self):
        rep = self.get_report()
        if self.reportfile:
            self.reportfile.write(rep)
        else:
            print rep
        if self.persistent_simgroup:
            self.persistent_simgroup.dump(self.sim_groups)
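For context, a minimal sketch of the settings the removed code consulted. The setting names are taken from the code above and from the rules loader further below; the values and the rule path are illustrative only.

    # Hypothetical settings sketch -- names from the removed code, values invented.
    SIMPAGES_ENABLED = True
    SIMPAGES_REPORT_FILE = 'simpages_report.txt'   # report is appended here; a .pickle dump is written alongside
    SIMPAGES_RULES = ['myproject.rules.SomeRule']  # paths loaded via load_class by RulesManager.load()
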
@@ -1,19 +0,0 @@
"""
This module contains several metrics that can be used with the
SimpagesMiddleware.

A metric must implement two functions:

1. simhash(response, *args)

   Receives a response and returns a simhash of that response. A simhash can
   be an object of any type; its only purpose is to provide a fast way of
   comparing the [simhashed] response with other responses (which will also
   be simhashed).

2. compare(simhash1, simhash2)

   Receives two simhashes and must return a float between 0 and 1, depending
   on how similar the two simhashes (and, thus, the responses they represent)
   are: 0 means completely different, 1 means identical.
"""
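A minimal metric conforming to this interface might look like the following sketch (hypothetical, not part of the removed code): it hashes a response by the set of lowercased words in its body and compares two hashes with the same Jaccard similarity the tagdepth metric below uses.

    # Hypothetical example metric -- illustrates the simhash/compare contract.

    def simhash(response, *args):
        # Represent the page by the set of lowercased words in its body.
        return set(response.body.lower().split())

    def compare(simhash1, simhash2):
        # Jaccard similarity: 1.0 for identical sets, 0.0 for disjoint ones.
        if not simhash1 and not simhash2:
            return 1.0
        return len(simhash1 & simhash2) / float(len(simhash1 | simhash2))
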
@@ -1,39 +0,0 @@
import math


def plot(data):
    """
    data is a dict of the form {key: quantity, ...}.
    Build a text histogram of the data dict and return it as a string.
    """
    maxv = max(data.values())
    minv = min(data.values())
    step = (maxv - minv) * 0.1 if (maxv - minv) != 0 else 1
    s = []
    for key, q in data.items():
        s1 = "%s%s: " % (key, blanks(6 - len(str(key))))
        # one '=' per step of the value
        for i in xrange(1, int(math.ceil(q / step) + 1)):
            s1 += "="
        if s1[-1] == '=':
            s1 += " "
        s1 += str(q)
        s.append(s1)
    maxl = len(max(s, key=lambda x: len(x)))
    s2 = '-' * maxl

    r = "\tgroup | quantities\n"
    r += "\t%s" % s2
    for x in s:
        r += "\n\t%s" % x
    return r


def blanks(n):
    return ' ' * n


def print_plot(data):
    print plot(data)
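As a usage sketch, print_plot({1: 1, 2: 2}) would emit something like the following (one '=' per tenth of the value range; spacing approximate):

    group | quantities
    ------------------------------
    1     : ========== 1
    2     : ==================== 2
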
@@ -1,39 +0,0 @@
"""
tagdepth metric

Compares pages by analyzing a predefined set of (relevant) tags and the
depth at which they appear in the page markup.

Requires the ResponseSoup extension to be enabled.
"""

from __future__ import division

from BeautifulSoup import Tag

relevant_tags = set(['div', 'table', 'td', 'tr', 'h1', 'p'])


def get_symbol_dict(node, tags=(), depth=1):
    # Map "<depth><tagname>" symbols to the number of times each occurs.
    symdict = {}
    for tag in node:
        if isinstance(tag, Tag) and tag.name in tags:
            symbol = "%d%s" % (depth, str(tag.name))
            symdict[symbol] = symdict.setdefault(symbol, 0) + 1
            # merge the counts from the subtree instead of overwriting them
            for k, v in get_symbol_dict(tag, tags, depth + 1).items():
                symdict[k] = symdict.get(k, 0) + v
    return symdict


def simhash(response, symnumbers=False):
    soup = response.soup
    symdict = get_symbol_dict(soup.find('body'), relevant_tags)
    if symnumbers:
        s = set([k + str(v) for k, v in symdict.items()])
    else:
        s = set(symdict.keys())
    return s


def compare(sh1, sh2):
    if sh1 == sh2:
        return 1.0
    else:
        # Jaccard index of the two symbol sets
        return len(sh1 & sh2) / len(sh1 | sh2)
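A worked example of compare: two tag-depth simhashes that share two of four distinct symbols yield a similarity of 0.5.

    # compare(set(['1div', '2table', '2td']),
    #         set(['1div', '2table', '2tr']))
    # intersection = 2 symbols, union = 4 symbols  ->  2/4 = 0.5
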
@@ -1,74 +0,0 @@
"""
Define here the rules that the rules engine/pipeline will process.
"""
from scrapy.utils.misc import load_class
from scrapy.conf import settings


class Rule(object):
    """
    Interface of the rules.
    Subclass this class to create a new rule.
    """
    def __init__(self, wresponse=None):
        self.__responsewrapper = wresponse

    def __getresponsewrapper(self):
        return self.__responsewrapper

    def __setresponsewrapper(self, wresponse):
        self.__responsewrapper = wresponse

    responsewrapper = property(__getresponsewrapper, __setresponsewrapper)

    def check(self):
        result = 0.0
        if self.responsewrapper:
            result = self.holds()
        if result < 0 or result > 1:
            raise ValueError("Value must be between 0 and 1.")
        return result

    def holds(self):
        """
        Subclasses must override this method with the conditions that the
        rule checks. The return value must be a number between 0.0 and 1.0.
        """
        return 0.0
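
# A hypothetical example of the interface (not part of the original rule
# set): a rule that holds fully when the wrapped response has any body
# text, using the bodytext property of ResponseWrapper shown further below.
class HasBodyTextRule(Rule):
    def holds(self):
        return 1.0 if self.responsewrapper.bodytext else 0.0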


class RulesManager(object):
    """
    The RulesManager takes care of loading and keeping track of all enabled
    rules. The module also provides an instantiated RulesManager (rules) to
    be used as a singleton.
    The RulesManager holds the rule classes, not instances of them; this
    approach gives the rules engine more flexibility.
    """

    def __init__(self):
        self.loaded = False
        self.enabled = {}

    def load(self):
        """
        Load the rules enabled through the SIMPAGES_RULES setting.
        """
        self.loaded = False
        self.enabled.clear()

        for extension_path in settings.getlist('SIMPAGES_RULES'):
            cls = load_class(extension_path)
            self.enabled[cls.__name__] = cls

        self.loaded = True

    def reload(self):
        self.load()


rules = RulesManager()
@@ -1,9 +0,0 @@
"""
Contains the exceptions used by the simpages code.
"""


class RulesNotLoaded(Exception):
    """
    Indicates that the rules were not loaded.
    """
    pass
@@ -1,52 +0,0 @@
"""
Represents the pipeline of execution for the rules engine.
"""
from scrapy import log
from scrapy.contrib.rulengine.exceptions import RulesNotLoaded

from scrapy.contrib.rulengine import rules


class RulesPipeline(object):

    rulesLoaded = []
    loaded = False

    def __init__(self, wresponse):
        """
        wresponse: a response wrapper object that contains the response.
        """
        self.__rules = []
        self._responsewrapper = wresponse

    @staticmethod
    def loadRules():
        RulesPipeline.loaded = True
        rules.load()
        try:
            for rulename in rules.enabled.keys():
                ldr_msg = 'Loading ... %s' % rulename
                ruleClass = rules.enabled[rulename]
                RulesPipeline.rulesLoaded.append(ruleClass())
                log.msg(ldr_msg)
                print ldr_msg
        except Exception, e:
            RulesPipeline.loaded = False
            RulesPipeline.rulesLoaded = []
            log.msg(e)
            print e

    def execute(self):
        """
        Returns a dictionary that contains the results of all the executed
        rules, keyed by rule class name under 'rules_executed'.
        """
        if RulesPipeline.loaded:
            rules_loaded = RulesPipeline.rulesLoaded
            info_dict = {}
            info_dict['rules_executed'] = {}
            for rule in rules_loaded:
                rule.responsewrapper = self._responsewrapper
                rule_result = rule.check()
                info_dict['rules_executed'][rule.__class__.__name__] = rule_result
            return info_dict
        else:
            raise RulesNotLoaded('Problems loading the rules.')
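For reference, a sketch of the dictionary execute() returns; rule names and values are illustrative only.

    # info = rp.execute()
    # {'rules_executed': {'SomeRule': 0.7, 'AnotherRule': 1.0}}
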
@@ -1,59 +0,0 @@
"""
This module provides a wrapper for the response object.
The objects created by ResponseWrapper are not responses, but objects that
contain a response plus a lot of useful information about it.
"""

import re

from BeautifulSoup import Tag


def extractText(soup):
    # Collect all text nodes except those inside <script> tags.
    text_in_tags = soup.findAll(text=True)
    res = []
    for tag in text_in_tags:
        if isinstance(tag.parent, Tag) and tag.parent.name not in ('script',):
            res.append(tag)
    return res


class ResponseWrapper(object):
    def __init__(self, response):
        self._response = response
        self.__soup_bodytext = extractText(response.soup.body) if response else None
        self.__soup_headtext = extractText(response.soup.head) if response else None
        self._soup = None
        self._bodytext = None
        self._headtext = None
        self._cleanbodytext = None

    def __cleantext(self, souptext):
        return filter(lambda x: re.search(r'\w+', x), souptext)

    @property
    def soup(self):
        # cache the soup the first time it is accessed
        if not self._soup:
            self._soup = self._response.soup
        return self._soup

    @property
    def bodytext(self):
        if not self._bodytext:
            self._bodytext = self.__cleantext(self.__soup_bodytext)
        return self._bodytext

    @property
    def headtext(self):
        if not self._headtext:
            self._headtext = self.__cleantext(self.__soup_headtext)
        return self._headtext

    @property
    def cleanbodytext(self):
        if not self._cleanbodytext:
            text = ' '.join(self.bodytext)
            text = text.lower()
            text = text.replace('\n', '')
            text = text.replace('\t', '')
            # strip HTML entities such as &nbsp;
            text = re.sub('&.*?;', '', text)
            self._cleanbodytext = text
        return self._cleanbodytext
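A usage sketch tying the pieces together, mirroring how SimpagesMiddleware drove them (all calls appear in the removed code above; it assumes RulesPipeline.loadRules() has already run and that response carries the ResponseSoup extension's .soup):

    wres = ResponseWrapper(response)
    rp = RulesPipeline(wres)
    rules_info = rp.execute()      # {'rules_executed': {...}}
    print wres.cleanbodytext       # lowercased body text with entities stripped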