#!/usr/bin/env python
'''timbl.py - Using Timbl through Python

This module contains a function timbl() which is a simple interface to TiMBL.

Copyright (c) 2011 CLiPS. All rights reserved.

# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__version__ = "1.1"
__author__ = "Vincent Van Asch"
__date__ = "August 2011"

import sys
import os
import subprocess
import time

# CONSTANTS #########################################################################

# Default location of the Timbl binary ('~' is expanded by abs()).
_timbl = '~/bin/Timbl'

# Verbosity levels for timbl(); each level includes the output of the lower ones.
NO = 0
TIME = 1
SETUP = 2
OUTPUT = 3
ALL = 4

# String types that abs() treats as paths; `unicode` only exists on Python 2.
try:
    _STRING_TYPES = (str, unicode)
except NameError:
    _STRING_TYPES = (str,)

# Horizontal rule used to delimit the verbose sections.
_RULE = '#########################################################'

# Template of the setup summary printed when verbose >= SETUP.
_SETUP_BANNER = '\n'.join([
    _RULE,
    'TRAIN  : %s',
    'TEST   : %s',
    'OPTIONS: %s',
    'OUTPUT : %s',
    'STDOUT : %s',
    'STDERR : %s',
    'TIMBL  : %s',
    'FULL COMMAND:',
    ' %s',
    _RULE,
])

# BODY ##############################################################################


class TiMBLError(Exception):
    '''Raised for every problem detected by this module.'''
    pass


# NOTE: this deliberately keeps its historical name although it shadows the
# builtin abs(); renaming it would break existing callers of the module.
def abs(p, check=False):
    '''Return the absolute, user-expanded version of the path p.

    p: a path string, or any other object (e.g. an open file or None).
    check: if True, raise TiMBLError when the resulting path is not an
           existing regular file.

    Non-string values are returned unchanged. An empty string yields None.
    '''
    if not isinstance(p, _STRING_TYPES):
        return p
    if not p:
        return None

    path = os.path.abspath(os.path.expanduser(p))
    if check and not os.path.isfile(path):
        raise TiMBLError('%s does not exist' % path)
    return path


def log(s):
    '''Write the message s to sys.stderr, prefixed with a timestamp.'''
    stamp = time.strftime('%d/%m/%Y %H:%M:%S')
    sys.stderr.write('%s: %s\n' % (stamp, s))


def humantime(secs):
    '''Returns seconds as the list [days, hours, minutes, seconds]'''
    days = int(secs / (24 * 3600))
    hours = int(secs % (24 * 3600) / 3600)
    minutes = int((secs % 3600) / 60)
    seconds = int((secs % 3600) % 60)
    return [days, hours, minutes, seconds]


def _elapsed_message(secs):
    '''Return the "Timbl fully completed in ..." message for secs seconds.'''
    parts = []
    for unit, value in zip(('day', 'h', 'min', 's'), humantime(secs)):
        # Leading zero units are dropped. Track this with the list itself:
        # testing the text for a trailing 'n' would also match '...min'.
        if value == 0 and not parts:
            continue
        if unit == 'day' and value > 1:
            unit += 's'
        parts.append('%d%s' % (value, unit))

    if not parts:
        return 'Timbl fully completed in less than a second.'
    return 'Timbl fully completed in ' + ' '.join(parts)


def _to_text(data):
    '''Return subprocess output as str (it is bytes on Python 3).'''
    if isinstance(data, str):
        return data
    return data.decode(sys.getdefaultencoding(), 'replace')


def _save_stream(target, text, label):
    '''Write text to target, which is a path, a file-like object or empty.

    label ('stdout' or 'stderr') is only used in the error message.
    '''
    if not target:
        return
    if hasattr(target, 'write'):
        if getattr(target, 'closed', False):
            raise TiMBLError('%s file object is closed' % label)
        target.write(text)
    else:
        with open(target, 'w') as handle:
            handle.write(text)


def timbl(trainingfile, testfile, options=None, outputfile=None, stderr=None,
          stdout=None, binary=_timbl, verbose=NO):
    '''Run TiMBL on a training file and a test file.

    trainingfile: the path to a file with labeled instances to use as training data
    testfile: the path to a file with (un)labeled instances to use as test data
    options: a list of Timbl options. E.g. ['-k10', '-dIL', '-v', 'cs+cm']
             None (the default) or an empty list means no extra options.
    outputfile: the path to a file to write the labeled test instances to. If None,
                no -o option is passed and Timbl picks its own output file.
    stderr: the path to a file to write stderr output to or an open file object.
            If None, the output is not saved.
    stdout: the path to a file to write stdout output to or an open file object.
            If None, the output is not saved.
    binary: the path to a working Timbl binary (default: ~/bin/Timbl)
    verbose: to set the amount of information that is reported. Each level
             includes the lower ones:
                NO     : nothing (default)
                TIME   : time taken to run TiMBL (logged to sys.stderr, only
                         when TiMBL exits with return code 0)
                SETUP  : all setup data is also printed
                OUTPUT : the stdout of the process is also printed
                ALL    : the stderr of the process is also printed

    Raises TiMBLError if the training or test file does not exist, if the
    binary is not an executable file, if a supplied file object is closed, or
    if TiMBL exits with a non-zero return code. In that last case the captured
    stderr and stdout are written to sys.stderr first.
    '''
    options = list(options) if options else []

    # Absolute paths
    trainingfile = abs(trainingfile, check=True)
    testfile = abs(testfile, check=True)
    outputfile = abs(outputfile)
    stderr = abs(stderr)
    stdout = abs(stdout)
    binary = abs(binary)

    # Check the binary. An empty value is rejected explicitly so it raises
    # TiMBLError instead of a TypeError from os.access().
    if not binary or not os.path.isfile(binary) or not os.access(binary, os.X_OK):
        raise TiMBLError('%s is not a working TiMBL binary' % (binary,))

    # Create the command
    cmd = [binary, '-f', trainingfile, '-t', testfile]
    if outputfile:
        cmd.extend(['-o', outputfile])
    cmd.extend(options)

    # Set up
    if verbose >= SETUP:
        summary = _SETUP_BANNER % (
            trainingfile,
            testfile,
            ' '.join(options) or 'default',
            outputfile or 'default',
            stdout or '-',
            stderr or '-',
            binary,
            ' '.join(cmd),
        )
        sys.stdout.write(summary + '\n')

    # Run TiMBL and measure only the time the process itself took
    start = time.time()
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    elapsed = time.time() - start
    out = _to_text(out)
    err = _to_text(err)

    if verbose >= OUTPUT:
        sys.stdout.write('STDOUT\n%s\n%s\n' % (out, _RULE))
    if verbose >= ALL:
        sys.stdout.write('STDERR\n%s\n%s\n' % (err, _RULE))

    if verbose >= TIME and not proc.returncode:
        log(_elapsed_message(elapsed))

    # The captured streams are saved even when TiMBL failed, so that they
    # can be inspected afterwards.
    _save_stream(stdout, out, 'stdout')
    _save_stream(stderr, err, 'stderr')

    if proc.returncode:
        sys.stderr.write('%s\n' % err)
        sys.stderr.write('%s\n' % out)
        raise TiMBLError('TiMBL did not terminate correctly (return code: %s)' % str(proc.returncode))