#!/usr/bin/env python
'''multitimbl.py

Script that can run a number of timbls simultaneously on multiple CPUs.

Depends on the timbl module :
    http://www.clips.ua.ac.be/~vincent/scripts/timbl.py

This file only uses constructs that are valid under both Python 2.6+ and
Python 3. Whether timbl.py itself runs under Python 3 is not checked here.

Copyright (c) 2011 CLiPS. All rights reserved.
'''
__author__ = "Vincent Van Asch"
__date__ = "August 2011"
__version__ = "1.1"

# Default path to Timbl
TIMBL = '~/bin/timbl'

import os, sys, shutil
import multiprocessing, time

# Version of timbl.py this script was written against
timblversion = "1.1"

try:
    import timbl
except ImportError:
    sys.stderr.write('Depends on timbl.py. Make sure timbl.py (v%s) can be imported. See http://www.clips.ua.ac.be/~vincent/scripts/timbl.py' % timblversion + '\n')
    sys.exit(1)
else:
    # Explicit comparison instead of `assert`: asserts are stripped when
    # Python runs with -O, which would silently disable the version check.
    if getattr(timbl, '__version__', None) != timblversion:
        sys.stderr.write('Depends on timbl.py. Could not find the right timbl.py or the right version.\nMake sure it is the one from http://www.clips.ua.ac.be/~vincent/scripts/timbl.py' + '\n')
        sys.exit(1)

# raw_input() only exists in Python 2; input() is its Python 3 equivalent.
try:
    _ask = raw_input
except NameError:
    _ask = input


# Timbl inspection function #####################################################################################

def overall(fname):
    '''
    Extract the overall scores from a Timbl logfile.

    Input: path to a Timbl logfile
    Output: dictionary with
        keys  : 'accuracy', 'micro-fscore', 'macro-fscore'
        values: associated score (float)

    Raises ValueError if one of the three scores is not present in the
    logfile.
    '''
    output = {'micro-fscore': None, 'macro-fscore': None, 'accuracy': None}

    for line in fread(fname):
        if 'F-Score' in line:
            # The score is the last whitespace-separated token of the line
            if 'microav' in line:
                output['micro-fscore'] = float(line.split()[-1])
            elif 'macroav' in line:
                output['macro-fscore'] = float(line.split()[-1])
        if 'overall accuracy' in line:
            # The score is the third token: "overall", "accuracy:", <score>
            output['accuracy'] = float(line.split()[2])

    if None in output.values():
        raise ValueError('Could not find all scores from "%s". Make sure +v cs option is set.' % fname)

    return output


def report(results, verbose=True):
    '''
    Average the scores of a series of experiments.

    Input: Output from runtimbl() with default parameters, i.e. an iterable
           of (trainingfile, testfile, scores) tuples in which scores is a
           dictionary as returned by overall().
    Output: Dictionary with the scores averaged over all experiments

    If verbose is True, the averages are also printed (as percentages) to
    stdout.

    Raises ZeroDivisionError if results is empty.
    '''
    count = 0
    output = {'micro-fscore': 0, 'macro-fscore': 0, 'accuracy': 0}

    # Get all scores
    for train, test, scores in results:
        count += 1
        for key in output:
            output[key] += scores[key]

    # Average
    for key in output:
        output[key] = float(output[key]) / count

    # Info
    if verbose:
        # The last value is the accuracy; the macro f-score used to be
        # printed a second time on the accuracy line.
        sys.stdout.write('''Average results on %d experiments
 - micro f-score: %8.5f%%
 - macro f-score: %8.5f%%
 - accuracy     : %8.5f%%''' % (count,
                                100 * output['micro-fscore'],
                                100 * output['macro-fscore'],
                                100 * output['accuracy']) + '\n')

    return output


# Helper functions #############################################################################################

def loginfo(s):
    '''Write s, prefixed with a timestamp, to stderr and flush.'''
    sys.stderr.write('%s: %s' % (time.strftime('%d/%m/%Y %H:%M:%S'), s) + '\n')
    sys.stderr.flush()


def fullname(p):
    '''Return p as an absolute path with a leading ~ expanded.'''
    return os.path.abspath(os.path.expanduser(p))


def fread(fname):
    '''Read in file: yield every non-empty line, stripped of surrounding whitespace'''
    # 'rU' asks Python 2 for universal newlines. Python 3 text mode already
    # provides them, and recent Python 3 versions reject the 'U' flag.
    if sys.version_info[0] < 3:
        mode = 'rU'
    else:
        mode = 'r'

    with open(fullname(fname), mode) as f:
        for raw in f:
            line = raw.strip()
            if line:
                yield line


def _datafiles(folder):
    '''Return the paths of the regular, non-hidden, non-metadata files in folder.'''
    paths = []
    for name in os.listdir(folder):
        # The tests are done on the file name. Testing the joined path for a
        # leading dot never matches for absolute folders and matches
        # everything for a folder like "./train".
        if name.startswith('.') or name.endswith('metadata'):
            continue
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            paths.append(path)
    return paths


# Multi-processing ###############################################################################################

def process(args):
    '''
    Run a single timbl experiment.

    args is one tuple (so that the function can be mapped by a
    multiprocessing pool) consisting of:
        train, test, options, outputfile, logfile, timblbin, verbose,
        id1, id2, expcount, totalexps, analysis

    Timbl's stdout is sent to logfile. The Timbl outputfile is removed once
    Timbl is done; only the logfile is handed to the analysis function.

    Returns (id1, id2, output-of-analysis)
    '''
    # Parameters
    train, test, options, outputfile, logfile, timblbin, verbose, id1, id2, expcount, totalexps, analysis = args

    # Run timbl
    if verbose:
        loginfo('Starting %s (train) %s (test) (%d/%d)' % (id1, id2, expcount + 1, totalexps))
    timbl.timbl(train, test, options=options, outputfile=outputfile, stderr=None, stdout=logfile, binary=timblbin, verbose=verbose)

    # Remove outfile
    os.remove(outputfile)

    # Get scores:
    scores = analysis(logfile)

    return id1, id2, scores


def runtimbl(traindir, testdir, outdir=None, timblbin=TIMBL, options=None, analysis=overall, verbose=True, cpu=None):
    '''
    Runs a Timbl experiment for every combination of a training file in traindir and a
    testfile in testdir. Uses multiple CPUs.

    traindir: Folder with timbl training files
    testdir : Folder with timbl testfiles
    outdir  : Folder to write files to; if None writes to testdir/traindir.exps.
              The files in this folder are the files to be used by the analysis function.
              If the folder already exists, the user is asked interactively whether
              to remove it, to continue with it or to abort.
    timblbin: Path to timbl binary
    options : Timbl options as a list of strings; if None uses ['+v', 'cs']
    analysis: Function that analyses the Timbl logfile. The input should be the path to the
              Timbl logfile. The output is free. It is sent to the worker processes, so it
              has to be picklable (e.g. a module-level function).
    verbose : Print info
    cpu     : Number of CPUs to use; if None use all available CPUs

    Files that are hidden (name starts with a dot) or whose name ends with "metadata"
    are ignored, as are subfolders. A file exp.metadata describing the run is written
    to testdir.

    Returns a list of tuples. Every tuple consists of:
        (trainingfile, testfile, output-of-analysis)
    in which the files are given by their base name. The order of the list is the order
    in which the experiments finished. Returns None if the user chose not to touch an
    existing output folder.
    '''
    # No mutable default argument; work on a private copy
    if options is None:
        options = ['+v', 'cs']
    else:
        options = list(options)

    # The number of cpu's to use
    processes = multiprocessing.cpu_count()
    if cpu:
        if cpu > processes:
            loginfo('INFO: can only use %d CPUs' % processes)
        else:
            processes = cpu

    # All files (no dirs, no hidden files, no metadata)
    trains = _datafiles(traindir)
    tests = _datafiles(testdir)
    totalexps = len(trains) * len(tests)

    # Output folder
    if not outdir:
        # normpath drops a trailing separator, which would otherwise make
        # the base name empty and the folder name just ".exps"
        outdir = os.path.join(testdir, os.path.basename(os.path.normpath(traindir)) + '.exps')

    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    else:
        answer = None
        while answer not in ['y', 'n', 'c', 'yes', 'no', 'continue']:
            answer = _ask('Warning: %s exists. Remove folder? ([y]es/[n]o/[c]ontinue): ' % outdir).strip().lower()

        if answer in ['y', 'yes']:
            shutil.rmtree(outdir)
            os.mkdir(outdir)
        elif answer in ['continue', 'c']:
            sys.stderr.write('CONTINUING -- Note that this mean that existing files may be overwritten.' + '\n')
        else:
            sys.stderr.write('EXIT -- Doing nothing' + '\n')
            return

    # __name__ is the spelling of func_name that exists in Python 2 and 3
    analysisname = '%s.%s' % (analysis.__module__, analysis.__name__)

    # Info
    if verbose:
        loginfo('Preparing to run %d experiments with %d CPUs' % (totalexps, processes))
        loginfo('Training files from: %s' % traindir)
        loginfo('Test files from: %s' % testdir)
        loginfo('Output files in: %s' % outdir)
        loginfo('Timbl options: %s' % (' '.join(options)))
        loginfo('Analysis function: %s' % analysisname)
        loginfo('Timbl: %s' % timblbin)
        loginfo('')

    # Meta data
    with open(os.path.join(testdir, 'exp.metadata'), 'w') as f:
        f.write('%d experiments\ntrain: %s\ntest: %s\ntimbl options: %s\nanalysis: %s' % (totalexps, traindir, testdir, ' '.join(options), analysisname))

    # Define all tasks
    expcount = 0
    tasks = []
    for train in trains:
        id1 = os.path.basename(train)
        for test in tests:
            id2 = os.path.basename(test)

            outputfile = os.path.join(outdir, 'train-%s.test-%s.out' % (id1, id2))
            logfile = os.path.join(outdir, 'train-%s.test-%s.log' % (id1, id2))

            tasks.append((train, test, options, outputfile, logfile, timblbin, verbose, id1, id2, expcount, totalexps, analysis))
            expcount += 1

    # Run all experiments on separate processors
    pool = multiprocessing.Pool(processes)
    try:
        if verbose:
            loginfo('-- Start --')
        output = list(pool.imap_unordered(process, tasks))
    except BaseException:
        # Do not leave workers behind when an experiment fails or the run
        # is interrupted
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Valid after close() as well as after terminate()
        pool.join()

    if verbose:
        loginfo('-- Done --')

    return output


# Command line ###################################################################################################

def _usage():
    '''Write the command line help to stderr.'''
    sys.stderr.write('''Multi-Timbl (version %s)

Runs all Timbl experiments for the combination of each training file in training_folder with each test file in test_folder.

USAGE
    $ python multitimbl.py [-o output_folder] [-b timbl] [-O options] [-s] [-c n] training_folder test_folder

OPTIONS
    -o output_folder: folder where to write all output (default: test_folder/training_folder.exps)
    -b timbl: path to Timbl binary (default: %s)
    -O options: Timbl options as one string (default "+v cs")
    -s: work silently
    -c n: numer of CPUs to use simultaneously

NOTE
    - Depends on timbl.py (v%s) (http://www.clips.ua.ac.be/~vincent/scripts/timbl.py)
    - It reports the accuracy, micro f-score and macro-fscore averaged over all experiments.
      If you need something different, you should supply a custom analysis function to runtimbl() and/or
      implement a custom report() function. See code.

%s, %s''' % (__version__, TIMBL, timblversion, __author__, __date__) + '\n')


def _main(argv):
    '''
    Command line entry point; argv is the argument list without the program name.

    Exits with status 2 on an invalid option or option value and with
    status 1 when the two folders are not given.
    '''
    import getopt

    try:
        opts, args = getopt.getopt(argv, 'ho:b:O:sc:', ['help'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    outdir = None
    timblbin = TIMBL
    options = ['+v', 'cs']
    verbose = True
    cpu = None

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        if o == '-o':
            outdir = fullname(a)
        if o == '-b':
            timblbin = fullname(a)
        if o == '-O':
            # The given options are added to the default ones
            options.extend(a.split())
        if o == '-s':
            verbose = False
        if o == '-c':
            try:
                cpu = int(a)
            except ValueError:
                # Treat a non-numeric CPU count like any other bad option
                _usage()
                sys.exit(2)

    if len(args) != 2:
        _usage()
        sys.exit(1)

    traindir, testdir = args

    # Run
    output = runtimbl(fullname(traindir), fullname(testdir), outdir=outdir, timblbin=timblbin, options=options, analysis=overall, verbose=verbose, cpu=cpu)

    # runtimbl() returns None when the user declined to use an existing
    # output folder; there is nothing to report in that case.
    if output is None:
        return

    if not output:
        # No train/test combinations were found, so there is nothing to average
        loginfo('No experiments were run; nothing to report')
        return

    report(output)


if __name__ == '__main__':
    _main(sys.argv[1:])