#!/usr/bin/env python '''groups.py Trying to find "consistent" groups in a list of figures. USAGE: >>> l = [1,2,15,16, 150] >>> search(l) [[1, 2], [15, 16], [150]] Copyright (c) 2011 CLiPS. All rights reserved. ''' __author__="Vincent Van Asch" __date__="August 2011" __version__="1.2" import os, sys from math import sqrt, pow class Group(list): @property def variance(self): '''The standard_deviation/average of the group''' return stdev(self)/average(self) class Ranking(list): def __init__(self, values=[]): self._dict={} list.__init__(self, values) def __call__(self, k): '''Returns the index of the group that contains k. Should be called only after the last element has been added to the Ranking.''' if not self._dict: for i,group in enumerate(self): for v in group: self._dict[v] = i try: return self._dict[k] except KeyError: raise IndexError('%s not in ranking' %str(k)) def average(x): '''Returns the average x: list ''' return sum(x)/float(len(x)) def stdev(x, ddof=1): ''' Returns the standard deviation x: list ddof: degrees of freedom ''' if len(x) ==1: return 0.0 av = average(x) ss = sum([pow(i-av, 2) for i in x])/float(len(x)-ddof) return sqrt(ss) def is_range(g): ''' Test function: A range is a list containing more than 1 element and consisting of consecutive numbers. ''' # check number of elements if len(g) < 2: return False # We don't care about doubles g = list(set(g)) g.sort() # This is quicker and safer for large numbers if len(g) != (g[-1] - g[0] + 1): return False # See if it are consecutive numbers if g == range(g[0], g[-1]+1): return True return False def search(data, threshold=0.08, threshold2=0.1, verbose=False): ''' Takes a list of figures and returns the groups as a list of Group objects. threshold: the maximum difference between consecutive variances when building the groups threshold2: the maximum variance for a group of 2 figures This function is developed for integer data but will also run on floats. ''' data.sort() groups=Ranking() group=Group() if verbose: print >>sys.stderr, '%35s\t%6s\t \t%8s\t%6s\t \t%8s' %('temp group', 'diff', 'max (-d)', 'var', 'max (-p)') for x in data: group.append(x) # Don't break if the group is a range if is_range(group): variance = group.variance if verbose: print >>sys.stderr, '%35s\trange' %(group) continue if len(group) > 2: value = abs(group.variance - variance) if verbose: print >>sys.stderr, '%35s\t%6.4f\t<\t%8.3f\t%6s\t \t%8s' %(group, value, threshold, ' ', ' '), if value > threshold: # Check to see if we should cut the first one if len(group) > 3 and Group(group[1:]).variance < Group(group[:-1]).variance: # We should break but first check to see if we better cut the first one # but don't cut in a range if not is_range(group[:2]): groups.append(Group([group[0]])) group = Group(group[1:]) variance = group.variance else: pregroup=Group(group[:2]) i = 2 while is_range(pregroup) and i < len(group): pregroup.append(group[i]) i+=1 if i == len(group): # Unexpected situation; just quit working with this group # and go on with the last element y = group.pop() groups.append(group) group=Group([y]) else: # Keep all elements after the initial range groups.append(Group(pregroup[:-1])) group = Group(group[i-1:]) variance=group.variance if verbose: print >>sys.stderr, '\tBREAK1: maximum difference exceeded (exclude first element)' elif x == group[-2]: # Make sure we do not separate figures that are the same cut = group.index(x) groups.append(Group(group[:cut])) group = Group(group[cut:]) variance = group.variance if verbose: print >>sys.stderr, '\tBREAK2: maximum difference exceeded (do not cut between same numbers)' else: y = group.pop() groups.append(group) group= Group([y]) if verbose: print >>sys.stderr, '\tBREAK3: maximum difference exceeded' else: if verbose: print >>sys.stderr elif len(group) > 1: if verbose: print >>sys.stderr, '%35s\t%6s\t \t%8s\t%6.4f\t<\t%8.3f' %(group, ' ', ' ', group.variance, threshold2), if group.variance > threshold2: y = group.pop() groups.append(group) group= Group([y]) if verbose: print >>sys.stderr, '\tBREAK4: variance of pair exceeds threshold' else: variance = group.variance if verbose: print >>sys.stderr # Attach final group groups.append(group) #Info if verbose: print >>sys.stderr, '\nVariance per group' for group in groups: print >>sys.stderr, '%.3f\t%s' %(group.variance, group) # check x=[];[x.extend(i) for i in groups];x.sort() assert x==data, 'Implementation error: %s' %x return groups def fread(fname): '''Takes a filename and returns a list of figures''' data=[] with open(os.path.abspath(os.path.expanduser(fname)), 'rU') as f: for l in f: line = l.strip() if line and not line.startswith('#'): try: digits = [int(x) for x in line.split()] except ValueError: digits = [float(x) for x in line.split()] print >>sys.stderr, 'WARNING: %s contains floats instead of integers.\n The script is not designed to process floats.' %(fname) data.extend(digits) return data def test(verbose=False, threshold=0.08, threshold2=0.1): '''Test various series of figures''' examples = [range(25), \ [1,2,3, 15,16,17, 15000000, 15000100, 16000000, 156], \ [2,2,2,2,2,2,2,92,92,76,73,76,72,169,166,3967,3889,3589, 6000, 100000, 50, 60, 92, 25], \ [2,2,2,2,2,2,2,92,92,76,73,76,72,169,166,3967,3889, 50, 60, 92, 25, 55, 65, 69, 3]+[3889 for i in range(20)]] for example in examples: print >>sys.stderr, '='*100 print >>sys.stderr, ' EXAMPLE:', example groups = search(example, verbose=verbose, threshold=threshold, threshold2=threshold2) print >>sys.stderr, '%3s GROUPS: %s' %(len(groups), groups) if __name__ == '__main__': import getopt # Defaults verbose=False extra_verbose=False threshold = 0.08 threshold2 = 0.1 def _usage(): print >>sys.stderr, '''Find groups in a list of figures (version %s) USAGE: ./groups.py [-d f] [-p f] [-v] fname [...] The groups are printed to STDOUT. Each group on a new line. OPTIONS fname: a file with integers. The integers should be separated by any kind of whitespace. A line starting with # is regarded as a comment. -d f : a float f defining the maximum difference between consecutive variances when building the groups. Decreasing -d will break larger groups more quickly. (default: %.2f) -p f : a float f defining the maximum variance for a group of 2 figures. Decreasing -p will break groups of 2 elements more quickly. (default: %.2f) -v: verbose -w: print internal processing information EXAMPLE 1, 2, 15, 16, 150, 1200, 1205, 1205 will be grouped into [1, 2], [15, 16], [150], [1200, 1205, 1205] If the groups are not according to your taste, you should try to change the -d and -p options. A range of consecutive numbers, like 15-16, will never be split. %s, %s''' %(__version__, threshold, threshold2, __author__, __date__) try: opts,args=getopt.getopt(sys.argv[1:],'hvd:p:w', ['help']) except getopt.GetoptError: # print help information and exit: _usage() sys.exit(2) for o, a in opts: if o in ('-h', '--help'): _usage() sys.exit() if o in ('-v',): verbose=True if o in ('-d',): threshold=float(a) if o in ('-p',): threshold2 = float(a) if o in ('-w',): extra_verbose = True verbose=True if not args: _usage() sys.exit(1) if verbose: print >>sys.stderr, '''=========================== Maximum difference (-d): %.3f Maximum variance (-p): %.3f ===========================''' %(threshold, threshold2) for fname in args: data = fread(fname) groups = search(data, threshold=threshold, threshold2=threshold2, verbose=extra_verbose) if verbose: print >>sys.stderr, '-'*27 if verbose: print >>sys.stderr, '%s contains %s figures;\ngrouped into %d groups' %(fname, len(data), len(groups)) if verbose: print >>sys.stderr, '-'*27 for group in groups: print ' '.join([str(x) for x in group]) if verbose: print >>sys.stderr, '-'*27