#!/usr/bin/env python
'''Creates the Zipf curve from a text file

Dependency:
    tokenizer.py (http://www.clips.ua.ac.be/~vincent/software.html)
    pylab (http://matplotlib.sourceforge.net/)

Both dependencies are imported on first use, so the random-text helpers
(and the -R command line mode) work without them.

The code runs on Python 2.6+ as well as on Python 3.
'''
__version__ = '1.0'
__date__ = 'November 2011'
__author__ = 'Vincent Van Asch'

import getopt
import io
import os
import random
import sys
import time
from operator import itemgetter

# True when running on Python 2, where the native str type is a byte string.
_PY2 = sys.version_info[0] == 2
# The unicode text type: `unicode` on Python 2, `str` on Python 3.
_TEXT = type(u'')
# Number of random characters generated per write call in randomtext().
_CHUNK = 1 << 16


def _native(value):
    '''Return value as a native str; unicode is UTF-8 encoded on Python 2.'''
    if _PY2 and isinstance(value, _TEXT):
        return value.encode('utf8')
    return str(value)


def _to_text(value):
    '''Return value as unicode text, decoding byte strings as UTF-8.'''
    if isinstance(value, bytes):
        return value.decode('utf8')
    return value


def _say(*parts):
    '''Write parts to stdout separated by spaces and followed by a newline.'''
    sys.stdout.write(' '.join([_native(part) for part in parts]) + '\n')


def _read_native(fname):
    '''Return the content of fname as a native str with universal newlines.

    On Python 2 the raw bytes are returned, exactly as before. On Python 3
    the file is decoded as UTF-8; text mode already normalises newlines
    there and the old 'rU' mode no longer exists.
    '''
    path = os.path.abspath(os.path.expanduser(fname))
    if _PY2:
        with open(path, 'rU') as handle:
            return handle.read()
    with io.open(path, 'r', encoding='utf8') as handle:
        return handle.read()


def loginfo(s):
    '''Write the message s to stderr, prefixed with a timestamp.'''
    stamp = time.strftime('%d/%m/%Y %H:%M:%S')
    sys.stderr.write(_native('%s: %s' % (stamp, s)) + '\n')


def zipf(fname, zipfdata=None, verbose=True):
    '''Returns the values for a zipf plot and the number of tokens read.

    fname: the text file to count the tokens of
    zipfdata: output of a previous call; its counts are added to the counts
              of fname so that several files can be accumulated
    verbose: log the number of tokens and types to stderr

    Returns a tuple (output, total).
    output is a dict
        key: token
        value: (rank, frequency, count)
    with rank 0 for the most frequent token and frequency the count divided
    by the number of tokens counted so far (previous data included).
    total is the number of tokens found in fname, as a float.
    '''
    txt = _read_native(fname)

    if '.' not in txt:
        loginfo('Did not find a full stop in %s. Not tokenizing merely splitting...' % os.path.basename(fname))
        sentences = txt.split()
    else:
        # Only needed for real sentence splitting, so import on demand.
        import tokenizer
        sentences = tokenizer.split(txt)

    # Start from the counts of the previous data
    counts = {}
    if zipfdata:
        for w, v in zipfdata.items():
            counts[w] = v[2]

    # Tokenize and count
    total = 0.0
    for sentence in sentences:
        for w in sentence.split():
            counts[w] = counts.get(w, 0) + 1
            total += 1

    # The counts are cumulative, so the frequencies have to be normalised by
    # the cumulative number of tokens and not by the size of the last file.
    grand_total = float(sum(counts.values()))

    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)

    output = {}
    for rank, (w, c) in enumerate(ranked):
        frequency = c / grand_total if grand_total else 0.0
        output[w] = (rank, frequency, c)

    if verbose:
        loginfo('%d tokens in %s' % (total, fname))
        loginfo('%d types' % len(counts))

    return output, total


# Colors that are still available for labelled curves; plot() consumes them.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']


def plot(zipfdata, label=None):
    '''Draw the zipf data on the current pylab axes using log-log scales.

    zipfdata: the dict returned by zipf()
    label: legend label; every labelled curve takes the next free color
           from the module level list colors (black when none is left).
           Unlabelled data is drawn in red.
    '''
    import pylab

    if label and isinstance(label, bytes):
        label = label.decode('utf8')

    points = sorted(zipfdata.values(), key=itemgetter(0))

    # zipf() ranks start at 0, which cannot be shown on a logarithmic axis;
    # plot the conventional 1-based rank so the top token is not dropped.
    x = [point[0] + 1 for point in points]
    y = [point[1] for point in points]

    # gca() reuses the current axes; axes() would stack a new one per call
    # on recent matplotlib versions.
    ax = pylab.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')

    if label:
        color = colors.pop(0) if colors else 'k'
        pylab.plot(x, y, color + '+', label=label)
    else:
        pylab.plot(x, y, 'r+')


def randomtext(N, alphabet='abcdefghijklmnopqrstuvwxyz', sepchance=0.3, outname=None):
    '''Create a random text file of N characters using the alphabet.

    N: the number of characters to write
    alphabet: a string or a list of characters. In a string the character at
              index i is i+1 times as likely as the first one. A list is
              used as is, so repeated characters are more likely.
    sepchance: the chance for inserting the word separator (a space); must
               be in [0, 1). With None a single separator is added to the
               pool of characters.
    outname: the name of the output file (default: randomtext.N.txt)

    Returns the name of the written file.
    Raises ValueError if sepchance is outside [0, 1).
    '''
    if isinstance(alphabet, (bytes, _TEXT)):
        chars = []
        for i, a in enumerate(_to_text(alphabet)):
            chars.extend([a] * (i + 1))
    else:
        chars = [_to_text(c) for c in alphabet]

    N = int(N)
    sep = u' '

    separators = 1
    if sepchance is not None:
        if not 0 <= sepchance < 1:
            raise ValueError('sepchance must be in [0, 1), got %r' % (sepchance,))
        # Number of separators needed to reach sepchance in the pool
        separators = int((sepchance * float(len(chars))) / (1.0 - sepchance))
    chars.extend([sep] * separators)

    if not outname:
        name = 'randomtext.%d.txt' % N
    else:
        name = outname

    pick = random.choice
    with io.open(name, 'w', encoding='utf8') as f:
        # Write in chunks: one write call per character is needlessly slow
        remaining = N
        while remaining > 0:
            size = min(remaining, _CHUNK)
            f.write(u''.join([pick(chars) for _ in range(size)]))
            remaining -= size

    loginfo('Written %s' % name)

    return name


def _report_alphabet(ranks, min_f, max_f):
    '''Print the spread, the size and the content of an alphabet to stdout.'''
    _say('SPREAD :', max_f / min_f)
    _say('ALPHABET LENGTH:', len(ranks))
    _say('ALPHABET:')
    _say(u' '.join([u'%s:%d' % pair for pair in ranks]))


def getcharfrequency(fname, factor=1000, verbose=True):
    '''Can be used to extract an alphabet for randomtext()

    fname: a UTF-8 encoded text file
    factor: the maximum allowed ratio between the counts of the most and the
            least frequent character (at least 1)
    verbose: print an analysis of the alphabet

    Returns a tuple (alphabet, sepchance): alphabet is a list in which every
    character is repeated proportionally to its frequency and sepchance is
    the fraction of characters that are whitespace or sentence terminals.

    Raises ValueError if the file contains no alphabet characters or if
    factor is so small that no character is left.

    Notes:
      - all characters transformed to lowercase
      - Infrequent characters are dropped until the maximum difference
        between most and least frequent is factor
      - sentence terminals (?!.) are not in the output
    '''
    with io.open(fname, 'r', encoding='utf8') as f:
        txt = f.read().lower()

    # normalize whitespace
    txt = u' '.join(txt.split())

    chars = {}
    sepcount = 0
    for c in txt:
        if c.strip() and c not in u'.?!':
            chars[c] = chars.get(c, 0) + 1
        else:
            sepcount += 1
    total = len(txt)

    if not chars:
        raise ValueError('No alphabet characters found in %s' % fname)

    sepchance = float(sepcount) / total

    if verbose:
        _say('Probability of sentence delimiters ( .?!):', sepchance)

    # Least frequent character first
    ranks = sorted(chars.items(), key=itemgetter(1))
    min_f = float(ranks[0][1])
    max_f = float(ranks[-1][1])

    if verbose:
        loginfo('Analysis')
        _report_alphabet(ranks, min_f, max_f)

    # Pruning: drop every character that is more than factor times less
    # frequent than the most frequent one.
    ranks = [(c, cc) for c, cc in ranks if max_f / cc <= factor]
    if not ranks:
        raise ValueError('factor must be at least 1, got %r' % (factor,))

    # Scale the counts so that the least frequent character occurs once
    min_f = float(ranks[0][1])
    ranks = [(c, int(cc / min_f)) for c, cc in ranks]

    if verbose:
        loginfo('Extracted alphabet')
        _report_alphabet(ranks, min_f, max_f)

    out = []
    for char, freq in ranks:
        out.extend([char] * freq)

    return out, sepchance


def mimictext(fname, verbose=True, N=10000000):
    '''Mimics a text with random tokens

    Writes N random characters that follow the character frequencies of
    fname to a file next to it: a trailing .txt is replaced with .mimic.txt,
    otherwise .mimic.txt is appended to fname.

    Returns the name of the written file.
    '''
    # Get the character frequencies
    if verbose:
        loginfo('Extracting alphabet for %s' % fname)
    alphabet, sepchance = getcharfrequency(fname, factor=1000, verbose=verbose)

    # create file
    if verbose:
        loginfo('Creating text of %d characters with same character probabilities' % N)

    outname = fname
    if outname.endswith('.txt'):
        outname = outname[:-4]
    outname += '.mimic.txt'

    return randomtext(N, alphabet=alphabet, sepchance=sepchance, outname=outname)


def _usage():
    '''Write the command line help to stderr.'''
    sys.stderr.write('''Create Zipf plot from file (version %s)

USAGE
    python zipf.py [-v] [-s] textfile1 [textfile2] [...]

OPTIONS
    -s : create a separate plot for each text file
    -v : verbose

DEPENDENCIES
    tokenizer.py (http://www.clips.ua.ac.be/~vincent/software.html)
    pylab (http://matplotlib.sourceforge.net/)

EXTRA OPTION -R
    -R : instead of using the textfiles to create a Zipf plot
         use the files to create synthetic files that have
         the same character frequencies as in the original file
         but the tokens are randomly generated.
         This may become interesting if you start playing with
         the code of the functions getcharfrequency() and randomtext().

NOTE
    Tokenizing long text files may take a lot of memory

%s, %s
''' % (__version__, __author__, __date__))


def main(argv=None):
    '''Run the command line interface; argv defaults to sys.argv[1:].'''
    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, args = getopt.getopt(argv, 'hsvR', ['help'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        sys.exit(2)

    verbose = False
    separate = False
    mimic = False

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            sys.exit()
        if o in ('-s',):
            separate = True
        if o in ('-v',):
            verbose = True
        if o in ('-R',):
            mimic = True

    if not args:
        _usage()
        sys.exit()

    if mimic:
        for fname in args:
            if verbose:
                loginfo('Creating random text from %s' % fname)
            mimictext(fname, verbose=verbose)
        sys.exit(0)

    # Plotting is only needed from here on
    import pylab

    data = {}
    for fname in args:
        data, wordcount = zipf(fname, zipfdata=data, verbose=verbose)

        if separate:
            plot(data, label=os.path.basename(fname))
            data = {}

    if not separate:
        plot(data)

    if separate:
        pylab.legend()

    pylab.title('Zipf plot')
    pylab.xlabel('Word rank')
    pylab.ylabel('Word frequency')
    pylab.show()


if __name__ == '__main__':
    main()