#!/usr/bin/env python
'''Script to create nfold splits from Timbl or Mbt style data

Created by Vincent Van Asch on 17/12/09.
Copyright (c) 2009 CLiPS. All rights reserved.

# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt

The module runs unchanged on Python 2.6+ and on Python 3.
'''
__author__='Vincent Van Asch'
__date__='August 2011'
__version__='1.2'

import sys
import os
import random
import getopt
import shutil

#Styles
TIMBL= 1
MBT = 0

# Character used to glue the lines of one collection onto a single line.
_JOINER = '\x14'

# raw_input() was renamed to input() in Python 3; use whichever exists.
try:
    _read_input = raw_input
except NameError:
    _read_input = input


def _open_text(path):
    '''Opens path for reading text with universal newline handling.

    Python 2 needs the 'U' flag for this; Python 3 does it by default and
    rejects the flag from version 3.11 on.'''
    if sys.version_info[0] < 3:
        return open(path, 'rU')
    return open(path, 'r')


def _log(message, newline=True):
    '''Writes message to stderr. If newline is False the line is left open
    (followed by one space) so that a later "done" ends up on the same line.'''
    if newline:
        sys.stderr.write(message + '\n')
    else:
        sys.stderr.write(message + ' ')
    sys.stderr.flush()


def compress(fname, sep, dir):
    '''Writes all collections onto one line.

    A collection is a set of lines delimited by sep. The lines of a collection
    are stripped and joined with the character \\x14. White lines -- if they
    are not the separator -- are omitted.

    fname: the file to compress
    sep: the collection separator (compared after stripping whitespace)
    dir: the folder in which the output is written

    Output is written to dir/fname.c; the name of that file is returned.'''
    name = os.path.join(dir, os.path.basename(fname)+'.c')
    marker = sep.strip()
    sentence = []

    with _open_text(fname) as source:
        with open(name, 'w') as target:
            for raw in source:
                line = raw.strip()
                if line == marker:
                    # End of a collection: flush it as one line
                    target.write(_JOINER.join(sentence)+'\n')
                    sentence = []
                elif line:
                    sentence.append(line)
                # else: a white line that is not the separator; omitted

            # A last collection that is not followed by a separator
            if sentence:
                target.write(_JOINER.join(sentence)+'\n')

    return name


def decompress(fname, sep):
    '''Decompresses fname, compressed with compress(), and inserts sep
    after every collection of lines.

    fname: the compressed file
    sep: the collection separator; if it is "\\n" an empty line is inserted

    Output is written to fname.dc; the name of that file is returned.'''
    name = fname+'.dc'
    marker = sep.strip()

    if sep == '\n':
        terminator = '\n\n'
    else:
        terminator = '\n'+sep+'\n'

    with _open_text(fname) as source:
        with open(name, 'w') as target:
            for raw in source:
                line = raw.strip()
                if line and line != marker:
                    target.write('\n'.join(line.split(_JOINER)))
                    target.write(terminator)

    return name


def _count_instances(path):
    '''Returns the number of non-empty lines in path.'''
    with _open_text(path) as handle:
        return sum(1 for line in handle if line.strip())


def _part_sizes(total, n):
    '''Returns the n part sizes for total instances; the first total%n parts
    get one instance more than the others.'''
    sizes = [total // n] * n
    for i in range(total % n):
        sizes[i] += 1
    return sizes


def _distribute(corpus, part_names, part_sizes):
    '''Writes every non-empty line of corpus to a randomly chosen part file
    until each part holds the number of lines given in part_sizes.'''
    filled = [0] * len(part_names)
    choices = list(range(len(part_names)))
    handles = []
    try:
        # Opened inside the try so that nothing leaks if one open() fails
        for name in part_names:
            handles.append(open(name, 'w'))

        with _open_text(corpus) as source:
            for raw in source:
                line = raw.strip()
                if not line:
                    continue
                while True:
                    # Choose a part to assign the instance to
                    i = random.choice(choices)
                    if filled[i] < part_sizes[i]:
                        # There is still room in this part so store it
                        handles[i].write(line+'\n')
                        filled[i] += 1
                        break
                    # This part is full: never consider it again
                    choices.remove(i)
    finally:
        for handle in handles:
            handle.close()


def _write_fold(index, part_names, outdir):
    '''Creates test.index (a copy of part index) and train.index (all other
    parts, with every line stripped) in outdir.'''
    testfile = os.path.join(outdir, 'test.%d' %index)
    trainfile = os.path.join(outdir, 'train.%d' %index)

    shutil.copyfile(part_names[index], testfile)

    with open(trainfile, 'w') as train:
        for j, part in enumerate(part_names):
            if j == index:
                continue
            with _open_text(part) as handle:
                for line in handle:
                    train.write(line.strip()+'\n')


def create(fname, n=10, keep=True, style=TIMBL, sep='', verbose=True, crop_opportunity=True, outdir=None):
    '''
    Creates n test-train splits from fname in the folder fname.nfolds.
    Returns the name of the folder with the output, or None if the user
    answered "n" at one of the prompts.

    fname: the complete dataset
    crop_opportunity: if True, halts after compressing an MBT style file to
                      give the possibility to crop the compressed file.

    Options:
        n: the number of folds to create (at least 1)
        keep: if True keep intermediate files
        style: TIMBL or MBT
        sep: the collection separator for MBT style files. Note that if sep
             is a non-empty string, empty lines are omitted.
             E.g. of sep: <utt>, \\n
        verbose: if True print info to stderr
        outdir: the output folder. If not set: <fname>.<n>folds next to fname.
                If the folder exists the user is asked whether to remove it.

    Raises ValueError if n is smaller than 1 or if the dataset contains fewer
    instances than n.

    Styles:
        TIMBL: This means that every non-empty line contains an instance
               that can be stored in test or in train.
        MBT: This means that a collection of lines (most of the time all
             tokens of 1 sentence) should be kept together. Only the entire
             collection should be stored in test and train. A collection
             will not be split up. A collection is delimited with sep
             (most of the time <utt> or "\\n").

    How folds are created:
        - Distribute all instances (or collections) randomly over n parts.
          If the number of instances cannot be divided by the number of
          folds some parts will be bigger, but the sizes of two parts never
          differ by more than one instance.
        - Take every part once as the test partition, the other parts
          constitute the train partition.
    '''
    # Refuse impossible fold counts before touching the file system
    if n < 1:
        raise ValueError('the number of folds should be at least 1 (got %r)' %(n,))

    ####################################################################################################
    # INFO: Initialisation, making sure all folders exist and that the corpus contains only            #
    #       one "instance" per line                                                                    #
    ####################################################################################################

    # Some paths
    fullname = os.path.abspath(os.path.expanduser(fname))
    dir = os.path.dirname(fullname)
    if not outdir:
        outdir = os.path.join(dir, '%s.%dfolds' %(os.path.basename(fullname), n))
    outdir = os.path.abspath(os.path.expanduser(outdir))

    # Check if outputdir exists
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    else:
        answer = _read_input('Warning: %s exists. Remove (y/n): ' %outdir)
        if answer.strip().lower() == 'y':
            shutil.rmtree(outdir)
            os.mkdir(outdir)
        else:
            _log('Doing nothing')
            return

    # Create subdirs (and remove older ones without asking)
    partdir = os.path.join(outdir, 'parts')
    if os.path.isdir(partdir):
        shutil.rmtree(partdir)
    os.mkdir(partdir)

    if verbose:
        s = 'TIMBL'
        cs = ''
        if not style:
            s = 'MBT'
            cs = '\ncollection separator:'+repr(sep)
        _log('''++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
SETTINGS
inputfile: %s (%s)
outputfolder: %s
folds to be created: %d%s
removing intermediate files: %s
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++''' %(fullname, s, outdir, n, cs, bool(not keep)))

    # Check which style the corpusfile is
    corpus = fullname
    if not style:
        # This means that we have MBT style so first compress all collections into one line
        if verbose:
            _log('INFO: Compressing the corpus using %s as the collection separator...' %repr(sep), newline=False)
        corpus = compress(fullname, sep, outdir)
        if verbose:
            _log('done')

        if crop_opportunity:
            # Halt so that the compressed file can be cropped by hand
            answer = _read_input('Proceed? (y/n): ').lower().strip()
            while answer not in ['y', 'n']:
                answer = _read_input('Proceed? (y/n): ').lower().strip()
            if answer == 'n':
                return

    ####################################################################################################
    # INFO: First we randomly distribute all instances in n parts                                      #
    #       With these parts we will create the folds                                                  #
    ####################################################################################################

    # All filenames of the parts
    fnames = [os.path.join(partdir, 'part.%d' %i) for i in range(n)]

    # Get the total number of instances
    total = _count_instances(corpus)
    if total < n:
        _log('ERROR: the number of instances is lower than the number of folds.')
        raise ValueError('the number of instances(%d) is lower than the number of folds(%d)' %(total, n))

    # Get the number of instances per file
    partsize = _part_sizes(total, n)

    # Distribute the instances
    if verbose:
        _log('INFO: Distributing %d instances randomly over %d parts...' %(total, n), newline=False)
    _distribute(corpus, fnames, partsize)

    # The compressed corpus is an intermediate file
    if not keep and not style:
        os.remove(corpus)

    if verbose:
        _log('done')

    if not style:
        # MBT: All parts contain compressed lines, so decompress them
        if verbose:
            _log('INFO: Decompressing parts...', newline=False)
        fnames = [decompress(name, sep) for name in fnames]
        if verbose:
            _log('done')

    ####################################################################################################
    # INFO: Now we created n +/- equal parts. The names of these parts are stored in the list fnames   #
    #       We are now going to create the folds by taking n-1 parts as train and 1 part as test.      #
    ####################################################################################################
    if verbose:
        _log('INFO: Creating %d folds...' %n)

    for i in range(n):
        if verbose:
            _log('INFO: Fold %d ...' %i, newline=False)
        _write_fold(i, fnames, outdir)
        if verbose:
            _log('done')

    # Remove the parts if not required
    if not keep:
        if verbose:
            _log('INFO: Removing intermediate files')
        shutil.rmtree(partdir)

    if verbose:
        _log('DONE')

    return outdir


def _usage():
    '''Prints the command line help to stderr.'''
    _log('''Creating folds from TiMBL and MBT style files (version %s)

USAGE
    python nfold.py [-n int] [-k] [-m] [-s sep] [-v] filename [...]

OPTIONS
    -n int: the number of folds to create (default: 10)
    -m: inputfile is MBT style rather than Timbl style
    -s sep: use sep as the collection separator (default: empty line) (-m must be set)
    -v: print info
    -k: keep intermediate files

INFORMATION
    Styles:
        TIMBL: This means that every non-empty line contains an instance that can be stored in test or in train.
        MBT: This means that a collection of lines (most of the time all tokens of 1 sentence) should be kept
             together. Only the entire collection should be stored in test and train. A collection wil not be
             split up. A collection is delimited with sep. (most of the time <utt> or \\n.
             If the separator is a non-empty string (like <utt>) empty lines are disregarded.

    How folds are created:
        - Distribute all instances (or collections) randomly over n parts. If the number of instances cannot be
          divided by the number of folds some parts wil be bigger. But the difference between the smallest and
          the biggest part will never be greater that n.
        - Take every part once as the test partition, the other parts consitute the train partition.

%s
''' %(__version__, __date__))


def main(argv=None):
    '''Command line entry point.

    argv: the list of command line arguments without the program name
          (default: sys.argv[1:])

    Exits with status 2 on unknown options and with status 1 on invalid
    option values, missing arguments or missing input files.'''
    if argv is None:
        argv = sys.argv[1:]

    try:
        # 'folds=' needs the trailing '=' or --folds never receives its value
        opts, args = getopt.getopt(argv, 'hkms:vn:', ['help', 'keep', 'mbt', 'sep=', 'verbose', 'folds='])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    keep = False
    style = TIMBL
    sep = ''
    verbose = False
    n = 10

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        elif o in ('-k', '--keep'):
            keep = True
        elif o in ('-m', '--mbt'):
            style = MBT
        elif o in ('-s', '--sep'):
            sep = a
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('-n', '--folds'):
            try:
                n = int(a)
            except ValueError:
                _log('ERROR: the number of folds should be an integer')
                sys.exit(1)

    if n < 1:
        _log('ERROR: the number of folds should be at least 1')
        sys.exit(1)

    if not args:
        _usage()
        sys.exit(1)

    # Check every input file before starting any work
    for arg in args:
        path = os.path.abspath(os.path.expanduser(arg))
        if not os.path.isfile(path):
            _log('ERROR: %s does not exist' %path)
            sys.exit(1)

    # A shell cannot easily pass a real newline, so accept the two characters \n
    if sep == '\\n':
        sep = '\n'

    for fname in args:
        try:
            create(fname, n=n, keep=keep, style=style, sep=sep, verbose=verbose)
        except Exception:
            _log('ERROR')
            raise


if __name__ == '__main__':
    main()