""" Tabulate feature weight data into a single CSV for further analysis using other tools. This produces a CSV with header. The features themselves are not included. Marco Lui, February 2013 """ import argparse, os, csv, sys import numpy as np import bz2, base64 from cPickle import loads from langid.train.common import read_weights, read_features if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('model', metavar="MODEL_DIR", help="path to langid.py training model dir") parser.add_argument('output', metavar="OUTPUT", help = "write to OUTPUT") parser.add_argument('-f','--features', metavar="FILE", help = 'only output features from FILE') parser.add_argument('--raw', action='store_true', help="include raw features") parser.add_argument('--bin', action='store_true', help="include ig for lang-bin") args = parser.parse_args() def model_file(name): return os.path.join(args.model, name) # Try to determine the set of features to consider if args.features: # Use a pre-determined feature list print >>sys.stderr, "using user-supplied feature list:", args.features feats = read_features(args.features) elif os.path.exists(model_file('LDfeats')): # Use LDfeats print >>sys.stderr, "using LDfeats" feats = read_features(model_file('LDfeats')) else: raise ValueError("no suitable feature list") print >>sys.stderr, "considering {0} features".format(len(feats)) records = dict( (k, {}) for k in feats ) headers = [] headers.append('len') for k in feats: records[k]['len'] = len(k) # Document Frequency if os.path.exists(model_file('DF_all')): print >>sys.stderr, "found weights for document frequency" w = read_weights(model_file('DF_all')) headers.append('DF') for k in feats: records[k]['DF'] = w[k][0] # IG weights for the all-languages event if os.path.exists(model_file('IGweights.lang')): print >>sys.stderr, "found weights for lang" w = read_weights(model_file('IGweights.lang')) headers.append('IGlang') for k in feats: records[k]['IGlang'] = w[k][0] # IG weights for the all-domains event if os.path.exists(model_file('IGweights.domain')): print >>sys.stderr, "found weights for domain" w = read_weights(model_file('IGweights.domain')) headers.append('IGdomain') for k in feats: records[k]['IGdomain'] = w[k][0] # IG weights for language-binarized if args.bin and os.path.exists(model_file('IGweights.lang.bin')) and os.path.exists(model_file('lang_index')): print >>sys.stderr, "found weights for lang.bin" w = read_weights(model_file('IGweights.lang.bin')) # find the list of langs in-order with open(os.path.join(args.model, "lang_index")) as f: reader = csv.reader(f) langs = zip(*reader)[0] r_h = ['IGlang.bin.{0}'.format(l) for l in langs] headers.extend( r_h ) for k in feats: records[k].update( dict(zip(r_h, w[k])) ) if os.path.exists(model_file('LDfeats.scanner')) and os.path.exists(model_file('model')): print >>sys.stderr, "found weights for P(t|c)" with open(model_file('model')) as f: model = loads(bz2.decompress(base64.b64decode(f.read()))) with open(model_file('LDfeats.scanner')) as f: _, _, nb_feats = loads(f.read()) nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output = model nb_numfeats = len(nb_ptc) / len(nb_pc) nb_ptc = np.array(nb_ptc).reshape(len(nb_ptc)/len(nb_pc), len(nb_pc)) # Normalize to 1 on the term axis for i in range(nb_ptc.shape[1]): nb_ptc[:,i] = (1/np.exp(nb_ptc[:,i][None,:] - nb_ptc[:,i][:,None]).sum(1)) w = dict(zip(nb_feats, nb_ptc)) r_h = ['ptc.{0}'.format(l) for l in nb_classes] headers.extend( r_h ) for k in feats: records[k].update( dict(zip(r_h, w[k])) ) if args.raw: headers.append('feat') for k in feats: records[k]['feat'] = k print >>sys.stderr, "writing output" with open(args.output, 'w') as f: writer = csv.DictWriter(f,headers) writer.writeheader() writer.writerows(records.values()) print >>sys.stderr, "done"