#!/usr/bin/env python
'''Self-train module

A self-training cycle consists of the following steps:
    1- train the machine learner
    2- classify the unlabeled target data
    3- retrain the machine learner using data from step 1 and 2
    4- test the machine learner on labeled data

This module should be used as the following:

>>> scores_without_selftraining, scores_with_selftraining = selftrain('train', 'unlabeled', 'test', intermediate=True, verbose=True)

Note:
    This script has been tested on Mbt and Timbl experiments. Although
    writing your own mltrain() and mltest() should allow you to use other
    machine learners, it may introduce some unexpected errors, for instance
    during the concatenation of the training data and the unlabeled data.

# Copyright (c) 2009 CLiPS. All rights reserved.
# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__date__='January 2010'
__author__='Vincent Van Asch'
__version__='1.0'
__license__='GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt'
__url__ = 'http://www.clips.ua.ac.be/~vincent/software.html'

import os, sys, time, tempfile
import shutil, subprocess

try:
    import confusionmatrix
except ImportError:
    raise ImportError("You don't seem to have confusionmatrix.py. You can retrieve it from %s" %__url__)

# Shows which confusionmatrix module was picked up.
# Parenthesised so the statement is valid on both Python 2 and Python 3.
print(confusionmatrix.__file__)


# Setting ##################################################################################################################################

# Path to a working Timbl binary
timbl = '~/bin/Timbl'

# Mode used to read text files line by line. Python 2 needs the 'U' flag to
# get universal newlines; Python 3 text mode does this by default and
# rejects 'U' from version 3.11 on.
if sys.version_info[0] < 3:
    _READ_MODE = 'rU'
else:
    _READ_MODE = 'r'


# Example functions for mltest() and mltrain() #############################################################################################

def mltrain(fname):
    '''Takes a labeled datafile (Timbl format) and returns the path to the
    inferred model.

    The model is a fresh temporary file; the caller is responsible for
    removing it.'''
    fd, out = tempfile.mkstemp(prefix='selftrain.mltrain.')
    os.close(fd)

    # For Timbl the training file is the model
    try:
        shutil.copy(fname, out)
    except Exception:
        # Do not leave an empty temporary file behind when the copy fails
        os.remove(out)
        raise

    return out


def mltest(fname, model):
    '''Takes a labeled or unlabeled datafile (Timbl format)(fname) and a
    model (from mltrain()) and returns (file, prec, rec, fscore, acc)

    file: the path to a file with the data from fname in a format that can
          be used by mltrain() but with the labels as given by the machine
          learner. The caller is responsible for removing it.
    prec, rec, fscore, acc: scores in percentage. If the machine learner
          does not return a specific score it will be None. This
          implementation never computes prec and rec, so they are always
          None.

    Raises ValueError if the timbl setting does not point to an executable
    or if Timbl exits with a non-zero status.'''
    # Expand "~" before checking: os.access() does not do it, so checking the
    # raw setting would reject the default '~/bin/Timbl' unconditionally.
    binary = abspath(timbl)
    if not os.access(binary, os.X_OK):
        raise ValueError('The timbl setting in the script does not point to a working binary.')

    fd, out = tempfile.mkstemp(prefix='selftrain.mltest.')
    os.close(fd)

    try:
        devnull = open(os.devnull, 'w')
        try:
            value = subprocess.call([binary, '-f', model, '-t', fname, '-o', out], stdout=devnull, stderr=devnull)
        finally:
            devnull.close()

        if value:
            raise ValueError('Timbl did not run properly.')

        p, r, fscore, acc = None, None, None, None

        cm = confusionmatrix.getConfusionMatrixFromFile(out, training=model)
        fscore = 100*cm.microfmeasure()
        # NOTE(review): accuracy is read as an attribute while microfmeasure
        # is called -- confirm against confusionmatrix.py.
        acc = 100*cm.accuracy

        # Replace the gold label with the predicted one: drop the
        # second-to-last field of every non-empty line and keep the last one.
        name = out+'.pred'
        source = open(out, _READ_MODE)
        target = open(name, 'w')
        try:
            for raw in source:
                line = raw.strip()
                if line:
                    parts = line.split()
                    target.write(' '.join(parts[:-2]+[parts[-1]]))
                    target.write('\n')
        finally:
            target.close()
            source.close()
    finally:
        # The raw Timbl output is never needed by the caller, also not when
        # something went wrong along the way.
        os.remove(out)

    return name, p, r, fscore, acc


# Helper functions #########################################################################################################################

def abspath(fname):
    '''Returns the absolute path of an fname (with "~" expanded)'''
    return os.path.abspath(os.path.expanduser(fname))


def log(s):
    '''Writes s to stderr, prefixed with a timestamp'''
    sys.stderr.write('%s: %s\n' %(time.strftime('%d/%m/%Y %H:%M:%S'), s))


def safe(x):
    '''Makes a float from all elements of x[1:]. If an element is None it
    becomes -1.0. x[0] is kept as is. Returns a tuple.'''
    return (x[0],) + tuple(-1.0 if i is None else float(i) for i in x[1:])


def _unix_newlines(data):
    '''Returns the byte string data with every \\r\\n and \\r replaced by \\n'''
    return data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')


def concat(fname1, fname2):
    '''Concatenates the two files and returns the name of the concatenated
    file (a new temporary file; the caller is responsible for removing it).

    Newlines are normalised to \\n. Surrounding whitespace of the first file
    and leading whitespace of the second file are dropped, so the join never
    produces a blank line.'''
    # File 1
    # Make sure we end f1 with a newline and that we don't leave a blank line
    with open(abspath(fname1), 'rb') as handle:
        head = _unix_newlines(handle.read()).strip()
    if head:
        head += b'\n'

    # Outputfile
    fd, output = tempfile.mkstemp(prefix='concat.')
    try:
        # Bytes in, bytes out: works the same on Python 2 and Python 3 and
        # does not depend on the encoding of the data.
        with os.fdopen(fd, 'wb') as sink:
            sink.write(head)

            # File 2
            # Make sure we don't start f2 with a newline
            with open(abspath(fname2), 'rb') as handle:
                sink.write(_unix_newlines(handle.read()).lstrip())
    except Exception:
        # Do not leave a half-written temporary file behind
        os.remove(output)
        raise

    return output


# Main function ############################################################################################################################

def selftrain(traindata, unlabeleddata, testdata, mltrain=mltrain, mltest=mltest, verbose=True, intermediate=True):
    '''Runs the following steps - returning the score on the test:
        1- train the machine learner
        2- classify the unlabeled target data
        3- retrain the machine learner using data from step 1 and 2
        4- test the machine learner on labeled data

    It returns the score on testdata (prec, recall, fscore, acc). A score
    that is not returned by mltest() (i.e. None) is reported as -1.0.

    mltrain: a function that is an interface to the learning step of a
             machine learner. The argument of the function should be
             @traindata. The function should return the path to the inferred
             model. Note that this path is removed when selftrain() is done.
    mltest: a function that is an interface to the classification step of a
            machine learner. The arguments should be @testdata, @model (as
            returned by mltrain()). The function should return
            (fname, prec, recall, fscore, acc). A score can be None if it
            cannot be retrieved from the output of the ml. The scores should
            be in percentage [0-100].
                fname: a file based on testdata, in the same format as
                       traindata but with the labels as assigned by the ml.

    traindata: path to a file containing labeled data in the format as used
               by mltrain().
    unlabeleddata: path to a file containing (un)labeled data in a format
                   that can be used by mltest().
    testdata: path to a file with data that can be used by mltest().

    verbose: if True progress messages are written to stderr.
    intermediate: if True the function also reports on using train for
                  labeling testdata without using the unlabeled data. Note
                  that the value returned by the function changes to:
                      ((prec, recall, fscore, acc), (prec, recall, fscore, acc))
                  In which the first set are the results without
                  self-training and the second are the results with
                  self-training.

    Note that for step 3 the traindata and the file returned from mltest() in
    step 2 will be concatenated.

    All intermediate files are removed before returning, also when one of
    the steps raises.
    '''
    traindata = abspath(traindata)
    unlabeleddata = abspath(unlabeleddata)
    testdata = abspath(testdata)

    # Outputfiles
    name = '%s.%s.%s' %(os.path.basename(traindata), os.path.basename(unlabeleddata), os.path.basename(testdata))
    name1 = name+'.baseline'
    name2 = name+'.selftrain'

    # Info
    # __name__ is available on functions in both Python 2 and Python 3
    if verbose: log('MLTRAIN: %s.%s MLTEST: %s.%s' %(mltrain.__module__, mltrain.__name__, mltest.__module__, mltest.__name__))

    # Every file produced along the way, in the order in which it has to be
    # removed again.
    leftovers = []
    try:
        # Train the ml
        if verbose: log('Step1: started learning "%s" ...' %os.path.basename(traindata))
        modelfile = mltrain(traindata)
        leftovers.append(modelfile)

        # Label the test data without selftraining. The resulting file is
        # registered right away so it is cleaned up even if the move fails.
        baseline = None
        if intermediate:
            if verbose: log('Step1bis: started testing without selftraining...')
            out1 = safe(mltest(testdata, modelfile))
            baseline = out1[0]
            if verbose: log(' P: %5.2f R: %5.2f F: %5.2f A: %5.2f' %out1[1:])
            shutil.move(baseline, name1)

        # Label the unlabeled data
        if verbose: log('Step2: started labeling unlabeled data "%s" ...' %os.path.basename(unlabeleddata))
        unlabeled = mltest(unlabeleddata, modelfile)[0]
        leftovers.append(unlabeled)

        # Concatenate
        extratrain = concat(traindata, unlabeled)
        leftovers.append(extratrain)

        # Retrain the ml
        if verbose: log('Step3: started retraining...')
        modelfile2 = mltrain(extratrain)
        leftovers.append(modelfile2)

        # Test the ml
        if verbose: log('Step4: started labeling testdata "%s" ...' %os.path.basename(testdata))
        out2 = safe(mltest(testdata, modelfile2))
        leftovers.append(out2[0])
        shutil.move(out2[0], name2)
    finally:
        # Clean up
        if intermediate:
            leftovers.extend([baseline, name1])
        leftovers.append(name2)
        for path in leftovers:
            # Paths that were moved away or never created are skipped
            if path is not None and os.path.exists(path):
                os.remove(path)

    # Info
    if verbose: log('Finished: P: %5.2f R: %5.2f F: %5.2f A: %5.2f' %out2[1:])
    if verbose: log('-'*50)

    # Output
    if intermediate:
        return out1[1:], out2[1:]
    return out2[1:]