#!/usr/bin/env python
'''transcode.py

Script to convert a textfile in one encoding into another encoding.

Copyright (c) 2011 CLiPS. All rights reserved.

# License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt
'''
__author__ = "Vincent Van Asch"
__date__ = "April 2011"
__version__ = '1.1'

import codecs
import getopt
import os
import re
import sys

# Any of the three common line terminators; '\r\n' is listed first so that a
# Windows line ending counts as ONE break instead of two.
_LINE_BREAK = re.compile(r'\r\n|\r|\n')


def _report(message):
    '''Write message followed by a newline to stderr.

    Replaces the Python 2 only "print >>sys.stderr" statement so that the
    script behaves the same on Python 2 and Python 3.
    '''
    sys.stderr.write(message + '\n')


def _normalize_whitespace(txt, separator):
    '''Return txt with whitespace runs replaced by separator and every line
    terminated by a single '\\n'.

    Leading and trailing whitespace of each line is dropped. The empty
    fragment that follows a final line break is not a line of its own, so no
    extra blank line is appended to text that already ends with a newline.
    '''
    lines = _LINE_BREAK.split(txt)
    if lines and lines[-1] == '':
        lines.pop()
    return ''.join(separator.join(line.split()) + '\n' for line in lines)


def transcode(fname, source, target, verbose=False, rewhite=False):
    '''Transcode a text file from one encoding into another.

    fname: file to transcode
    source: encoding of the file
    target: encoding of the output file
    verbose: if true, print progress information to stderr
    rewhite: if a non-empty string, every run of whitespace inside a line is
             replaced by this string and all line endings become '\\n'

    Creates a file: fname.target

    Returns the absolute path of the created file, or None when an encoding
    name is unknown, the file cannot be decoded with the source encoding, or
    the text cannot be represented in the target encoding (an error message is
    printed to stderr in each of these cases). Errors from opening the files
    themselves (e.g. a missing input file) are not caught here.
    '''
    name = os.path.abspath(os.path.expanduser(fname))

    # codecs.open always works on the underlying file in binary mode, so a
    # plain 'r' is sufficient (the old 'rU' mode is rejected by Python 3.11+).
    try:
        reader = codecs.open(name, 'r', source)
    except LookupError:
        _report('ERROR: encoding "%s" is not known.' % source)
        return None

    try:
        with reader:
            txt = reader.read()
    except UnicodeDecodeError:
        _report('ERROR: Could not open "%s" with source encoding "%s". File has probably another encoding.' % (os.path.basename(name), source))
        return None

    name = name + '.' + target

    # Validate the target encoding BEFORE opening the output file: opening
    # first would create (or truncate) the file, and cleaning up afterwards
    # would destroy a file that already existed under that name.
    try:
        codecs.lookup(target)
    except LookupError:
        _report('ERROR: encoding "%s" is not known.' % target)
        return None

    if rewhite:
        if verbose:
            _report('''Replacing all (multiple) whitespaces with %s and newlines with '\\n'.''' % repr(rewhite))
        output = _normalize_whitespace(txt, rewhite)
    else:
        output = txt

    try:
        with codecs.open(name, 'w', target) as writer:
            writer.write(output)
    except UnicodeEncodeError:
        _report('ERROR: Could not write "%s" with target encoding "%s" because of unavailable characters. Try another target encoding.' % (os.path.basename(name), target))
        # The with-block has already closed the file, so removing the
        # incomplete output also works on platforms that lock open files.
        os.remove(name)
        return None

    if verbose:
        _report('Transcoded file from "%s" to "%s": %s' % (source, target, name))

    return name


def _usage():
    '''Print the command-line help text to stderr.'''
    _report('''Convert a file into another encoding (version %s)

USAGE
    ./transcode.py [-s enc] [-t enc] [-v] [-S|-T] fname1 [fname2 ...]

OPTIONS
    -s enc: The encoding of the source file. (default: latin1)
    -t enc: The encoding of the target file. (default: utf8)
    -S : Normalize whitespace to a single space and newlines to "\\n"
    -T : Normalize whitespace to a tab and newlines to "\\n"
    -v : Print some info

A list of available encodings can be found at:
http://docs.python.org/library/codecs.html#standard-encodings

%s''' % (__version__, __date__))


def main(argv=None):
    '''Parse the command line and transcode every file named on it.

    argv: list of arguments without the program name (default: sys.argv[1:])

    Returns the process exit status: 0 on a normal run or after -h/--help,
    1 when no file name is given, 2 when the options cannot be parsed.
    '''
    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, args = getopt.getopt(
            argv, 'hs:t:vST',
            ['help', 'source=', 'target=', 'verbose', 'rewhite', 'retab'])
    except getopt.GetoptError:
        # print help information and exit:
        _usage()
        return 2

    source = 'latin1'
    target = 'utf8'
    verbose = False
    rewhite = False

    for o, a in opts:
        if o in ('-h', '--help'):
            _usage()
            return 0
        if o in ('-s', '--source'):
            source = a
        if o in ('-t', '--target'):
            target = a
        if o in ('-v', '--verbose'):
            verbose = True
        if o in ('-S', '--rewhite'):
            rewhite = ' '
        if o in ('-T', '--retab'):
            rewhite = '\t'

    if len(args) < 1:
        _usage()
        return 1

    for fname in args:
        transcode(fname, source, target, verbose=verbose, rewhite=rewhite)
    return 0


if __name__ == '__main__':
    sys.exit(main())