#!/usr/bin/env python
'''Script to binarize Timbl instances

# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__date__ = 'March 2012'
__version__ = '3.0.5'
__author__ = 'Vincent Van Asch'

import getopt
import math
import os
import pickle
import sys
import time

LINE = '-' * 60


class FeatureError(Exception):
    '''Raised when a feature value cannot be rounded, binned or binarized.'''
    pass


def _err(message):
    '''Writes message followed by a newline to stderr.'''
    sys.stderr.write('%s\n' % (message,))


class Transdict(dict):
    '''A dict mapping feature values (or Bin objects) to integers.

    bins: if True, lookups compare the requested key against every stored key
          with ==, so that a number finds the Bin that contains it. (A Bin
          and a number it contains do not hash alike, so a plain dict lookup
          cannot be used for this.)'''

    def __init__(self, d=None, bins=False):
        dict.__init__(self, {} if d is None else d)
        self.bins = bool(bins)

    def __getitem__(self, i):
        if not self.bins:
            return dict.__getitem__(self, i)
        for k, v in self.items():
            if k == i:
                return v
        raise KeyError(i)


class Bin(object):
    '''The half-open numeric interval [lower; upper[.

    A Bin is equal to another Bin with the same boundaries and also to every
    int or float that it contains.'''

    def __init__(self, lower, upper):
        if lower >= upper:
            raise ValueError('lower should be strictly smaller than upper')
        self._lower = lower
        self._upper = upper

    def __repr__(self):
        return '[%s; %s[' % (str(self.lower), str(self.upper))

    __str__ = __repr__

    def __eq__(self, other):
        if isinstance(other, Bin):
            return self.lower == other.lower and self.upper == other.upper
        if isinstance(other, (int, float)):
            return self.contains(other)
        return False

    def __ne__(self, other):
        # Always the exact negation of __eq__, also for unrelated types.
        return not self.__eq__(other)

    def __hash__(self):
        return hash('%.5f%.5f' % (float(self.lower), float(self.upper)))

    @property
    def lower(self):
        '''The lower boundary (included in bin)'''
        return self._lower

    @property
    def upper(self):
        '''The upper boundary (not included)'''
        return self._upper

    def contains(self, value):
        '''True if lower <= value < upper.'''
        return self.lower <= value < self.upper


def log(s):
    '''Writes s, prefixed with a timestamp, to stderr.'''
    _err('%s: %s' % (time.strftime('%d/%m/%Y %H:%M:%S'), s))


def read(fname, sep=None):
    '''A generator yielding all non-empty lines as lists.

    fname: path of the instance file (~ is expanded)
    sep: the feature separator (None means any whitespace)'''
    with open(os.path.abspath(os.path.expanduser(fname))) as handle:
        for raw in handle:
            line = raw.strip()
            if line:
                yield line.split(sep)


def getbins(d, bins):
    '''Rewrites d such that it complies to bins.

    d: dict {position: {feature value (string): count}}
    bins: dict {position: half of the binsize}

    Returns a new dict in which every binned position maps to a Transdict
    {Bin: summed count}; the other positions are copied unchanged.
    Raises FeatureError if a binned position holds a value that is not a
    finite number.'''
    out = {}
    for position, size in bins.items():
        if position not in d:
            continue
        counts = d[position]

        # Get all values: integers if possible, floats otherwise
        try:
            values = [(int(x), y) for x, y in counts.items()]
        except ValueError:
            try:
                values = [(float(x), y) for x, y in counts.items()]
            except ValueError:
                raise FeatureError('Cannot convert non-numeric feature at position %d to numeric.' % position)

        if not values:
            out[position] = Transdict(bins=True)
            continue

        # nan/inf are never contained in a finite bin, so the search for the
        # next bin below would never end.
        for value, _ in values:
            if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
                raise FeatureError('Cannot bin non-finite feature value at position %d.' % position)

        # Create first bin: the smallest value sits in the middle of it
        values.sort(key=lambda x: x[0])
        lower = values[0][0] - size
        upper = lower + 2 * size
        current = Bin(lower, upper)

        # Get all bins and keep only bins that contain a value
        subdict = Transdict({current: values[0][1]}, bins=True)
        for value, freq in values[1:]:
            while not current.contains(value):
                lower = upper
                upper = lower + 2 * size
                current = Bin(lower, upper)
            try:
                subdict[current] = subdict[current] + freq
            except KeyError:
                subdict[current] = freq
        out[position] = subdict

    # add non-bins
    for k, v in d.items():
        if k not in out:
            out[k] = v.copy()
    return out


def getrounded(s, ndecimals):
    '''Returns the string s rounded to ndecimals.

    Raises FeatureError if s is not a number that can be rounded.
    NOTE: exact ties are broken by the interpreter's round(), which differs
    between Python 2 and Python 3.'''
    try:
        v = float(s)
    except ValueError:
        raise FeatureError('Cannot round non-numeric feature "%s"' % s)

    ndecimals = int(ndecimals)
    try:
        if ndecimals == 0:
            return str(int(round(v)))
        template = '%%.%df' % ndecimals
        return template % round(v, ndecimals)
    except (ValueError, OverflowError):
        # e.g. nan or inf cannot be converted to an integer
        raise FeatureError('Cannot round non-numeric feature "%s"' % s)


def extract_features(fname, threshold=0, sep=None, idindex=None, verbose=False, bins=False, rounded=False):
    '''Returns a tuple (dict of dict of the features, number of features).

    The dict:
        key: position in Timbl instance
        value:
            key: value of timblfeature (a Bin for binned positions)
            value: position in binarized instance

    threshold: features with an occurrence less than threshold are omitted.
               So, they are not in the outputdict.
    sep: the feature separator of fname (None means any whitespace)
    idindex: ignore the feature at this index (should be a list of int)
    bins: dict {position: half binsize} or a false value
    rounded: dict {position: number of decimals} or a false value

    The second element of the outputtuple is the number of features in the
    binarized instance.
    Raises FeatureError if no feature value is left to translate.'''
    if verbose and threshold > 1:
        log('Feature values occurring less than %d times are omitted.' % threshold)

    # Also accept False, which is the default createfeatureinfo() passes on.
    if not idindex:
        idindex = []

    d = {}
    for line in read(fname, sep):
        # The last field is the class label
        for i, feature in enumerate(line[:-1]):
            if i in idindex:
                # Skip this feature because it should be ignored
                continue

            # Round if required
            if rounded and i in rounded:
                feature = getrounded(feature, rounded[i])

            counts = d.setdefault(i, {})
            counts[feature] = counts.get(feature, 0) + 1

    # Rewrite for bins if needed:
    if bins:
        d = getbins(d, bins)

    # Create the outputdict
    outputposition = 0
    for subdict in d.values():
        # Iterate over a snapshot because entries are removed on the way
        for feature, count in list(subdict.items()):
            if count >= threshold:
                # Replace the count with the position in the binarized instance
                subdict[feature] = outputposition
                outputposition += 1
            else:
                # Remove the entry
                subdict.pop(feature)

    # Check that we have at least 1 feature value to translate
    if not any(d.values()):
        raise FeatureError('No available features to binarize.')

    return d, outputposition


def rewrite(fname, transdict, nfeatures, output=sys.stdout, isep=None, osep=None, correct=None, svm=False, idindex=None, bins=None, rounded=False, commented=True):
    '''Rewrites all Timblinstances in fname using transdict.

    transdict: a dict as returned by extract_features()
    nfeatures: the number of features in the binarized instance
               (also from extract_features())
    output: an open writable fileobject
    isep: feature separator in inputfile
    osep: feature separator in outputfile
    correct: if given this classlabel becomes classlabel 1 and the others
             become -1. If not set the original label is kept.
    svm: if True write output in SVM light format
    commented: if True add original instance to SVM light format
    idindex: ignore the feature at this index (should be a list of int)
    bins: dict {position: half binsize} or a false value
    rounded: dict {position: number of decimals} or a false value

    Returns the number of instances written.
    Raises FeatureError if a value that should be rounded or binned is not
    numeric.'''
    if osep is None:
        osep = ' '
    if not idindex:
        idindex = []

    template = osep.join(['%d'] * nfeatures) + osep + '%s\n'

    nlines = 0
    for line in read(fname, isep):
        nlines += 1

        # The active binary features of this instance: {index: 1}
        instance = {}

        # Fill in the features
        for i, feature in enumerate(line[:-1]):
            if i in idindex:
                # Skip this feature because it should be ignored
                continue

            if rounded and i in rounded:
                feature = getrounded(feature, rounded[i])

            if bins and i in bins:
                try:
                    feature = int(feature)
                except ValueError:
                    try:
                        feature = float(feature)
                    except ValueError:
                        raise FeatureError('Cannot convert non-numeric feature at position %d to numeric.' % i)

            # Unknown positions and unknown values are silently omitted
            try:
                index = transdict[i][feature]
            except KeyError:
                pass
            else:
                instance[index] = 1

        # Class label
        if correct:
            if line[-1] == correct:
                cl = '1'
            else:
                cl = '-1'
        else:
            cl = line[-1]

        if svm:
            # Make sparse and insert class label
            fields = sparsify(instance)
            fields.insert(0, cl)
            text = ' '.join(fields)
            if commented:
                text = text + ' # %s' % (' '.join(line))
            text = text + '\n'
        else:
            fields = fullinstance(instance, nfeatures)
            fields.append(cl)
            text = template % tuple(fields)

        # Output
        output.write(text)

    return nlines


def sparsify(instance):
    '''Takes an instance and returns a sparse version: a list of
    "index:value" strings sorted by index, with indices starting at 1.'''
    return ['%d:%d' % (index + 1, instance[index]) for index in sorted(instance)]


def fullinstance(instance, length):
    '''Returns an instance list of length with values from instance'''
    output = [0] * length
    for index, value in instance.items():
        output[index] = value
    return output


def deploy(fname, verbose=False):
    '''Unpickle feature info.

    Returns the (transdict, nfeatures) tuple stored in fname.'''
    # SECURITY: unpickling can execute arbitrary code. Only load .feats files
    # that were written by this script and come from a trusted source.
    with open(os.path.expanduser(fname), 'rb') as f:
        transdict, nfeatures = pickle.load(f)

    if verbose:
        log('Read featureinfo from %s' % fname)

    return transdict, nfeatures


def createfeatureinfo(fname, threshold, isep, verbose=False, idindex=False, stdout=False, bins=None, rounded=False, svm=False, see_transdict=False):
    '''Dumps the info about the features in fname to fname.feats

    stdout: if True prints to stdout rather than to fname.feats
    The other arguments are as for extract_features(); svm only influences
    the printed translation when see_transdict is set.

    Returns the transdict. Exits with status 1 if there are no features.'''
    # Do the job
    try:
        info = extract_features(fname, threshold, isep, idindex, verbose=verbose, bins=bins, rounded=rounded)
    except FeatureError as e:
        _err('ERROR: %s' % e.args[0])
        sys.exit(1)

    # Info
    if see_transdict:
        _err(visualize_transdict(info[0], svm))
    if verbose:
        if idindex:
            log('Ignoring feature at index %s' % str(idindex))
        if bins:
            _err(bininfo(bins))
        if rounded:
            _err(roundinfo(rounded))

    # Create output: the pickle followed by a newline
    data = pickle.dumps(info, pickle.HIGHEST_PROTOCOL) + b'\n'
    if stdout:
        name = 'STDOUT'
        # The pickle is binary data; Python 3 needs the underlying byte stream
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(data)
        stream.flush()
    else:
        name = os.path.abspath(os.path.expanduser(fname)) + '.feats'
        with open(name, 'wb') as f:
            f.write(data)

    if verbose:
        log('Written featureinfo to %s' % name)

    # Return the transdict
    return info[0]


def visualize_transdict(d, svm=False):
    '''A string representing the translation dictionary as nice as possible'''
    if not d:
        return ''

    output = [LINE, 'FEATURE TRANSLATION']
    for p in sorted(d):
        output.append(' *Feature at original position %d' % p)
        for fv, bp in sorted(d[p].items(), key=lambda x: x[1]):
            if svm:
                output.append(' "%s" --> svm feature index %d' % (fv, bp + 1))
            else:
                output.append(' "%s" --> binary feature at position %d' % (fv, bp))
    output.append(LINE)
    return '\n'.join(output)


def bininfo(d):
    '''An infostring listing the full binsize for every binned position.'''
    out = [LINE, 'CONSTRUCTED BINS']
    for k in sorted(d):
        out.append(' Position %d: binsize %f' % (k, 2 * d[k]))
    out.append(LINE)
    return '\n'.join(out)


def roundinfo(d):
    '''An infostring listing the number of decimals for every rounded position.'''
    out = [LINE, 'ROUNDED VALUES']
    for k in sorted(d):
        out.append(' Position %d: rounded to %d decimals' % (k, d[k]))
    out.append(LINE)
    return '\n'.join(out)


def process(fname, threshold=0, isep=None, osep=None, verbose=False, correct=None, svm=False, predict=None, idindex=None, see_transdict=False, stdout=False, bins=None, rounded=False, commented=True):
    '''Writes to fname.threshold[.svm].bin the binarized instances.

    threshold: featurevalues occurring less than threshold are omitted
    isep: feature separator in inputfile (None = whitespace)
    osep: feature separator in outputfile
    correct: if given this classlabel becomes classlabel 1 and the others
             become -1. If not set the original label is kept.
    svm: if True write output in SVM light format
    predict: a filename of a pickled transdict/nfeatures tuple
    idindex: ignore the feature at this index (e.g. because it's an id)
    see_transdict: if True print the feature translation to stderr
    stdout: if True write to stdout instead of to the output file
    bins, rounded, commented: as for rewrite()

    Returns the name of the output file ('STDOUT' if stdout is set).
    Exits with status 1 if there is nothing to binarize.'''
    # Name of output file
    outputfile = fname + '.%d' % threshold
    if svm:
        outputfile += '.svm'
    outputfile += '.bin'

    # Get feature info
    error = None
    if predict:
        transdict, nfeatures = deploy(predict, verbose)
    else:
        if verbose:
            _err('''WARNING: Constructing feature translation dictionary from file.\nWARNING: Please note that this means that you cannot use this file to create a test file.''')
        try:
            transdict, nfeatures = extract_features(fname, threshold, isep, idindex, verbose=verbose, bins=bins, rounded=rounded)
        except FeatureError as e:
            # Keep the error: the name bound by "except" does not outlive
            # the handler on Python 3.
            error = e
            transdict = {0: {}}

    # Check that there is something in the transdict
    if not any(transdict.values()):
        if error is not None:
            _err('ERROR: %s' % error.args[0])
        else:
            _err('ERROR: No available features to binarize.')
        sys.exit(1)

    # print info
    if see_transdict:
        _err(visualize_transdict(transdict, svm))
    if verbose:
        log('Number of features in binarized instance: %d' % nfeatures)
        if correct:
            log('All instances with classlabel "%s" get the class label 1; the others get -1.' % correct)
        else:
            log('All instances have the original class labels.')
        if svm:
            log('Writing output in SVM Light format.')
        if idindex:
            log('Ignoring feature at index %s' % str(idindex))
        if bins:
            _err(bininfo(bins))
        if rounded:
            _err(roundinfo(rounded))

    # Create output
    if stdout:
        outputfile = 'STDOUT'
        o = sys.stdout
    else:
        o = open(outputfile, 'w')

    try:
        nlines = rewrite(fname, transdict, nfeatures, output=o, isep=isep, osep=osep, correct=correct, svm=svm, idindex=idindex, bins=bins, rounded=rounded, commented=commented)
    except FeatureError as e:
        _err('ERROR: %s' % e.args[0])
        sys.exit(1)
    finally:
        # Only close what was opened here; sys.stdout belongs to the caller.
        if stdout:
            o.flush()
        else:
            o.close()

    if verbose:
        log('Written %d binarized instances to %s' % (nlines, outputfile))

    return outputfile


def splitignores(s):
    '''Takes a string like:
        5          --> [5]
        5-6        --> [5, 6]
        1,2,5-6,9  --> [1, 2, 5, 6, 9]
    and returns a sorted list of unique integers.

    Raises ValueError if s does not have this format.'''
    out = set()
    for part in s.split(','):
        if '-' in part:
            start, stop = part.split('-')
            out.update(range(int(start), int(stop) + 1))
        else:
            out.add(int(part))
    return sorted(out)


def splitbins(s, zeros=True):
    '''Takes a string like
        0:0.001           --> {0:0.001}
        0,1:0.001         --> {0:0.001, 1:0.001}
        0-5:0.001         --> {0:0.001, 1:0.001, 2:0.001, 3:0.001, 4:0.001, 5:0.001}
        0,1:0.001=3:0.1   --> {0:0.001, 1:0.001, 3:0.1}
    and returns the dict.

    zeros: if True a value of 0 is refused. Negative values are always
           refused.
    Raises ValueError if s is not legal or defines a position twice.'''
    out = {}
    for part in s.split('='):
        indices, size = part.split(':')
        indices = splitignores(indices)

        try:
            size = int(size)
        except ValueError:
            size = float(size)

        if zeros and not size:
            raise ValueError('binsize must be > 0')
        if size < 0:
            # A negative binsize or number of decimals only fails later on
            raise ValueError('value must be >= 0')

        for index in indices:
            if index in out:
                raise ValueError('binsize defined twice for position %d' % index)
            out[index] = size

    return out


_USAGE = '''Binarize Timbl instances (version %s)

USAGE
    ./binarize.py [-t cutoff] [-I int] [-B bins] [-R rounds] [-i sep] [-o sep] [-c label] [-s] [-q] [-C | -f fname] [-S] [-p] [-v] instancefile

    Writes binarized versions of the Timbl instances in instancefile to
    instancefile.cutoff.bin or stdout.

OPTIONS
    -t cutoff: Feature values occurring less than this threshold (integer) are omitted. (default : 0)
    -I int: Ignore the feature at given indices, e.g. because it's an id. Format see extended help (-m).
    -B bins: Option to store numeric features in bins. Format see extended help (-m).
    -R rounds: Numeric features are rounded. Format see extended help (-m).
    -i sep: Feature separator of instancefile. (None means any whitespace; "tab" means \\t)(default: None)
    -o sep: Feature separator of outputfile. (None means single space; "tab" means \\t)(default: None)
    -c label: If set, the instances with this class label get class label 1; the others get -1.
              If not set, the original class label is kept.
    -s: Write to SVM Light format.
    -q: Per default, the original instance is added as a comment to SVM Light instances.
        If -q is set the original instance is not added.
    -C: Instead of binarizing instances, write feature info to fname.feats to be used in a second step.
        fname.feats is not human readable.
    -f fname: Use feature info from fname instead of extracting feature knowledge from instancefile.
    -S: Print info about the feature translations to stderr.
    -p: Print the binarized instances to stdout rather than to instancefile.cutoff.bin.
        When -C is set. It prints the feature info to stdout rather than to fname.feats.
    -v: Print info to stderr.
    -h: Show help
    -m: Show extended help and examples

%s

%s, %s
'''

_EXTENDED_HELP = '''FORMAT
    *The format of the argument of the -I option should be as in Timbl:
        -I 4       : ignores the feature at index 4
        -I 2-4     : ignores the features at indices 2, 3, and 4
        -I 0,2,6-9 : ignores the features at indices 0, 2, 6, 7, 8, and 9
     Note that for Timbl the indices start counting at 1, here the indices start counting at 0.

    *The format of the -B option should be:
        -B index:binsize[=index:binsize]*
        index: can be any of the formats as for the -I option: e.g. 4 or 2-4 or 0,4 or a combination
        binsize: an integer or float specifying the half of the binsize: e.g. 0.05 creates bins of width 0.1.

     Example: if a feature has the values:
        0.5 0.56 0.61 0.7 1.0
     then there are 6 bins
        [0.45-0.55[ [0.55-0.65[ [0.65-0.75[ [0.75-0.85[ [0.85-0.95[ [0.95-1.05[
     These bins are created by placing the smallest value in the middle of the first bin
     and the progressing with bins of size 0.1 until the highest value is contained in a bin.
     Empty bins will not be converted into a binary feature. This means that values in the
     test file belonging to a bin that is not in the training are omitted.

     You cannot define a binsize twice for the same position.

     Possible -B strings are:
        -B 0:0.001         : feature at position 0 will be binned into 0.002 bins
        -B 0,1:0.001       : feature at position 0 and 1 will be binned into 0.002 bins
        -B 0-5:0.001       : feature at position 0 up until 5 included will be binned into 0.002 bins
        -B 0,1:0.001=3:0.1 : feature at position 0 and 1 will be binned into 0.002 bins
                             and at position 3 will be binned into 0.2 bins

    *The format of the -R option is almost the same as for the -B option:
        -R index:ndecimals[=index:ndecimals]
     Except ndecimals must be an integer greater than or equal to 0. This integer gives the
     number of decimals to take into account.

     Example: when ndecimals is set to 1, the following values map to the same binary feature:
        0.05 0.06 0.07 0.0999999 0.1 all map to 0.1
        0.0 0.02 0.04 all map to 0.0
     If ndecimals is set to 0, all floats map to their closest integers.

     If neither -B or -R are set, a feature value of 2.0 will map to a different binary
     feature than the value 2.

     Rounding is carried out before binning if both options are set for the same position.

DESCRIPTION
    ##### Workflow 1 #####
    If you just want to binarize a file, run:

        $ ./binarize.py instancefile

    You can't split the binarized file in train/test afterwards because in this case you
    used information from the whole file (including the future test partition) to binarize
    the instances.

    ##### Workflow 2 #####
    If you have a train/test split you first need to write the feature info from train to a
    file with the command:

        $ ./binarize.py -C train

    This will create a file train.feats

    Next, you can binarize train and test with feature info from train.feats

        $ ./binarize.py -f train.feats train
        $ ./binarize.py -f train.feats test

    This way of working ensures that you don't use information of test to binarize the files.

    If set, the -I, -i, -B, -R options should be set with all steps.
    If set, the -t option should only be set with the first step.
    if set, the -o, -c, -s options should only be set with the last steps.

ISSUES
    Note that there is no guarantee that a binarized instance does not consist of all zeros.
    There is also no check to see if a binarized feature has only a value of 0 or only a
    value of 1 over all instances.'''


def _usage(more=False):
    '''Prints the help (the extended help if more is True) to stderr.'''
    extra = 'Run with -m option for more info on usage.'
    if more:
        extra = _EXTENDED_HELP
    _err(_USAGE % (__version__, extra, __author__, __date__))


def main(argv=None):
    '''Command line entry point.

    argv: the list of arguments (default: sys.argv[1:])
    Exits with status 2 on usage errors and status 1 on illegal settings.'''
    if argv is None:
        argv = sys.argv[1:]

    # User options
    try:
        opts, args = getopt.getopt(argv, 'hvt:i:o:c:sCSpf:I:B:R:mq')
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    # Defaults
    verbose = False
    threshold = 0
    isep = None
    osep = None
    correct = None
    svm = False
    commented = True
    createinfo = False
    predict = None
    idindex = []
    ignores = None
    see_transdict = False
    stdout = False
    bins = None
    rounded = None

    for o, a in opts:
        if o == '-m':
            _usage(more=True)
            sys.exit(2)
        elif o == '-h':
            _usage()
            sys.exit(2)
        elif o == '-v':
            verbose = True
        elif o == '-i':
            if a != 'None':
                isep = a
        elif o == '-o':
            if a != 'None':
                osep = a
        elif o == '-t':
            try:
                threshold = int(a)
            except ValueError:
                _err('ERROR: the -t value should be an integer: %s' % a)
                sys.exit(1)
        elif o == '-c':
            correct = a
        elif o == '-s':
            svm = True
        elif o == '-q':
            commented = False
        elif o == '-C':
            createinfo = True
        elif o == '-f':
            predict = os.path.expanduser(a)
        elif o == '-I':
            ignores = a
        elif o == '-S':
            see_transdict = True
        elif o == '-p':
            stdout = True
        elif o == '-B':
            bins = a
        elif o == '-R':
            rounded = a

    # Check settings
    if len(args) != 1:
        _usage()
        sys.exit(2)

    fname = os.path.expanduser(args[0])
    if not os.path.isfile(fname):
        _err('ERROR: %s does not exist' % fname)
        sys.exit(1)

    if predict and createinfo:
        _err('ERROR: -f and -C cannot be set both')
        sys.exit(1)

    if predict and not os.path.isfile(predict):
        _err('ERROR: %s does not exist' % predict)
        sys.exit(1)

    if ignores is not None:
        try:
            idindex = splitignores(ignores)
        except Exception:
            _err('ERROR: the -I string is not legal: %s' % ignores)
            sys.exit(1)

    if bins is not None:
        try:
            bins = splitbins(bins)
        except Exception:
            _err('ERROR: the -B string is not legal: %s' % bins)
            sys.exit(1)

    if rounded is not None:
        # Keep the raw string so both error messages show what the user typed
        rawrounded = rounded
        try:
            rounded = splitbins(rawrounded, zeros=False)
        except Exception:
            _err('ERROR: the -R string is not legal: %s' % rawrounded)
            sys.exit(1)
        for v in rounded.values():
            if not isinstance(v, int):
                _err('ERROR: the -R string is not legal: %s' % rawrounded)
                sys.exit(1)

    if isep == 'tab':
        isep = '\t'
    if osep == 'tab':
        osep = '\t'

    # Run
    if createinfo:
        createfeatureinfo(fname, threshold, isep, verbose, idindex, stdout=stdout, bins=bins, rounded=rounded, svm=svm, see_transdict=see_transdict)
    else:
        process(fname, threshold, isep, osep, verbose, correct, svm, predict, idindex, see_transdict=see_transdict, stdout=stdout, bins=bins, rounded=rounded, commented=commented)


if __name__ == '__main__':
    main()