#!/usr/bin/env python '''Module to caluculate the weights for weighted overlap The weight for feature i is: infogain = H(C) + CH(C) gain ratio = ( H(C) + CH(C) )/ si si = -1 * sum( P(vi)*log(P(vi), 2) ) H(C) = -1 * sum( P(ci)*log(P(ci), 2) ) CH(C) = -1 * sum( P(vi)*H(C|vi) ) With H(C|v): -1 * sum( P(ci)*log( P(ci), 2) ) but only for the classes of instances that have value v for the feature in focus. (See Timbl reference guide) # License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt ''' __date__ ='October 2011' __author__='Vincent Van Asch' __version__='1.3' from math import log import os, sys, getopt def read(fname, sep=None): '''Yields all lines as lists. sep: the feature separator ''' f=open(os.path.abspath(os.path.expanduser(fname))) try: for l in f: line= l.strip() if line: yield line.split(sep) finally: f.close() def entropy(f): '''Takes a frequency and returns -1*f*log(f,2)''' return -1 * f * log(f,2) def weights(fname, sep, split_info=False): '''Takes an fname and returns 2 or 3 lists of values. The first list is information gain; the second is gain ratio; the third is the split info (if split_info == True). sep: the feature separator in the Timbl file fname ''' classcounts={} total=0 valuecounts=[] for line in read(fname, sep): features = line[:-1] classlabel = line[-1] #Count the number of instances total+=1 #Store the class counts try: classcounts[classlabel]+=1 except KeyError: classcounts[classlabel]=1 #Store the counts per featurevalue if not valuecounts: valuecounts = [{} for i in range(len(features))] for i,fv in enumerate(features): #Update the valuefrequency try: valuecounts[i][fv][0]+=1 except KeyError: #classlabel:0 becuse we update the classlabel count below valuecounts[i][fv]= [ 1, {classlabel:0} ] # Update the classfrequency for this value try: valuecounts[i][fv][1][classlabel]+=1 except KeyError: #Because we created valuecounts[i][fv] in the previous step #we know the keyerror comes from the classlabel valuecounts[i][fv][1][classlabel]=1 total = float(total) # calculate database entropy H(C) HC = 0 for cl,count in classcounts.items(): freq = count/total HC += entropy(freq) # Calculate all weights (information gain and gain ratio) IG=[] # Informaton gain GR=[] # Gain Ratio SI = [] # Split Info for fd in valuecounts: si=0 ch = 0 for fv, clist in fd.items(): # calculate H(C|vi) hcv = 0 for cl,c in clist[1].items(): cfreq = c / float(clist[0]) hcv += entropy(cfreq) #Calculate split values freq = clist[0]/total si += entropy(freq) #Calculate conditional entropy -1*P(vi)*H(C|vi) ch += (-1* freq * hcv) #Information gain ig = HC + ch #Gain ratio if si == 0: gr = 0 else: gr = ig / si #Store IG.append(ig) GR.append(gr) SI.append(si) if split_info: return IG, GR, SI else: return IG, GR def format(ig, ir, si=None): '''Print the lists''' if not si: print 'Feat\tInfoGain\tGain Ratio' f='%4d\t%f\t%f' for i, w1, w2 in zip(range(len(ig)), ig, ir): print f %(i+1, w1, w2) else: print 'Feat\tInfoGain\tGain Ratio\tSplit Info' f='%4d\t%f\t%f\t%f' for i, w1, w2, w3 in zip(range(len(ig)), ig, ir, si): print f %(i+1, w1, w2, w3) def _usage(): print >>sys.stderr, '''Calculates Information Gain (IG) and Gain Ratio (GR) of Timbl files. (version %s) USAGE ./weightedoverlap.py [-s sep] [-S] timblfile timblfile: file with Timbl instances OPTIONS -s sep: the feature separator (deafult: all whitespace) for tab: -s '\\t' -S : also report the split info ACKNOWLEDGEMENTS Based on the information in the Timbl reference guide. http://ilk.uvt.nl/timbl %s, %s''' %(__version__, __author__, __date__) if __name__ == '__main__': try: opts,args=getopt.getopt(sys.argv[1:],'hs:S', ['help', 'sep=']) except getopt.GetoptError: # print help information and exit: _usage() sys.exit(2) sep=None split_info=False for o, a in opts: if o in ('-h', '--help'): _usage() sys.exit() if o in ('-s', '--sep'): sep=a if o in ('-S',): split_info=True if sep == '\\t': sep='\t' if len(args) != 1: print >>sys.stderr, 'ERROR: you can only provide 1 Timbl file.' sys.exit(1) format(*weights(args[0], sep, split_info))