#!/usr/bin/env python
"""
train.py -
All-in-one tool for easy training of a model for langid.py.
This depends on the training tools for individual steps, which can be
run separately.

Marco Lui, January 2013

Copyright 2013 Marco Lui. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are
those of the authors and should not be interpreted as representing official
policies, either expressed or implied, of the copyright holder.
"""

TRAIN_PROP = 1.0  # probability that any given document is selected
MIN_DOMAIN = 1  # minimum number of domains a language must be present in to be included
MAX_NGRAM_ORDER = 4  # largest order of n-grams to consider
TOP_DOC_FREQ = 15000  # number of tokens to consider for each order
NUM_BUCKETS = 64  # number of buckets to use in k-v pair generation
CHUNKSIZE = 50  # maximum size of chunk (number of files tokenized - less = less memory use)
FEATURES_PER_LANG = 300  # number of features to select for each language

import argparse
import os, csv
import numpy
import base64, bz2, cPickle
import shutil

from common import makedir, write_weights, write_features, read_weights, read_features
from index import CorpusIndexer
from tokenize import build_index, NGramTokenizer
from DFfeatureselect import tally, ngram_select
from IGweight import compute_IG
from LDfeatureselect import select_LD_features
from scanner import build_scanner, Scanner
from NBtrain import learn_nb_params


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("-p", "--proportion", type=float,
      help="proportion of training data to use", default=TRAIN_PROP)
  parser.add_argument("-m", "--model",
      help="save output to MODEL_DIR", metavar="MODEL_DIR")
  parser.add_argument("-j", "--jobs", type=int, metavar='N',
      help="spawn N processes (set to 1 for no parallelization)")
  parser.add_argument("-t", "--temp", metavar='TEMP_DIR',
      help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets")
  parser.add_argument("-d", "--domain", metavar="DOMAIN", action='append',
      help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)")
  parser.add_argument("-l", "--lang", metavar="LANG", action='append',
      help="use LANG - can be specified multiple times (uses all langs found if not specified)")
  parser.add_argument("--min_domain", type=int,
help="minimum number of domains a language must be present in", default=MIN_DOMAIN) parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) parser.add_argument("--max_order", type=int, help="highest n-gram order to use", default=MAX_NGRAM_ORDER) parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) parser.add_argument("--df_tokens", type=int, help="number of tokens to consider for each n-gram order", default=TOP_DOC_FREQ) parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") parser.add_argument("--df_feats", metavar="FEATS", help="Instead of DF feature selection, use a list of features from FEATS") parser.add_argument("--ld_feats", metavar="FEATS", help="Instead of LD feature selection, use a list of features from FEATS") parser.add_argument("--feats_per_lang", type=int, metavar='N', help="select top N features for each language", default=FEATURES_PER_LANG) parser.add_argument("--no_domain_ig", action="store_true", default=False, help="use only per-langugage IG in LD calculation") parser.add_argument("--debug", action="store_true", default=False, help="produce debug output (all intermediates)") parser.add_argument("--line", action="store_true", help="treat each line in a file as a document") group = parser.add_argument_group('sampling') group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140) group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None) parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") args = parser.parse_args() if args.sample_count and args.line: parser.error("sampling in line mode is not implemented") if args.df_feats and args.ld_feats: parser.error("--df_feats and --ld_feats are mutually exclusive") corpus_name = os.path.basename(args.corpus) if args.model: model_dir = args.model else: model_dir = os.path.join('.', corpus_name+'.model') makedir(model_dir) # display paths print "corpus path:", args.corpus print "model path:", model_dir indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, langs = args.lang, domains = args.domain, line_level=args.line) # Compute mappings between files, languages and domains lang_dist = indexer.dist_lang lang_index = indexer.lang_index lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items())) print "langs({0}): {1}".format(len(lang_dist), lang_info) domain_dist = indexer.dist_domain domain_index = indexer.domain_index domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items())) print "domains({0}): {1}".format(len(domain_dist), domain_info) print "identified {0} documents".format(len(indexer.items)) if args.line: print "treating each LINE as a document" items = sorted(set( (d,l,p) for (d,l,n,p) in indexer.items )) if args.debug: langs_path = os.path.join(model_dir, 'lang_index') domains_path = os.path.join(model_dir, 'domain_index') index_path = os.path.join(model_dir, 'paths') # output the language index with open(langs_path,'w') as f: writer = csv.writer(f) writer.writerows((l, lang_dist[lang_index[l]]) for l in sorted(lang_index, key=lang_index.get)) # output the domain index with open(domains_path,'w') as f: writer = csv.writer(f) writer.writerows((d, 
                        domain_dist[domain_index[d]])
                       for d in sorted(domain_index, key=domain_index.get))

    # output items found
    with open(index_path, 'w') as f:
      writer = csv.writer(f)
      writer.writerows(items)

  if args.temp:
    buckets_dir = args.temp
  else:
    buckets_dir = os.path.join(model_dir, 'buckets')
  makedir(buckets_dir)

  if args.ld_feats:
    # LD features are pre-specified. We are basically just building the NB model.
    LDfeats = read_features(args.ld_feats)

  else:
    # LD features not pre-specified, so we compute them.

    # Tokenize
    DFfeats = None
    print "will tokenize %d documents" % len(items)
    # TODO: Custom tokenizer if doing custom first-pass features
    if args.df_feats:
      print "reading custom features from:", args.df_feats
      DFfeats = read_features(args.df_feats)
      print "building tokenizer for custom list of {0} features".format(len(DFfeats))
      tk = Scanner(DFfeats)
    elif args.word:
      print "using word tokenizer"
      tk = str.split
    else:
      print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
      tk = NGramTokenizer(1, args.max_order)

    # First-pass tokenization, used to determine DF of features
    tk_dir = os.path.join(buckets_dir, 'tokenize-pass1')
    makedir(tk_dir)
    b_dirs = build_index(items, tk, tk_dir, args.buckets, args.jobs, args.chunksize,
                         args.sample_count, args.sample_size, args.line)

    if args.debug:
      # output the paths to the buckets
      bucketlist_path = os.path.join(model_dir, 'bucketlist')
      with open(bucketlist_path, 'w') as f:
        for d in b_dirs:
          f.write(d + '\n')

    # We need to compute a tally if we are selecting features by DF, but also if
    # we want full debug output.
    if DFfeats is None or args.debug:
      # Compute DF per-feature
      doc_count = tally(b_dirs, args.jobs)
      if args.debug:
        doc_count_path = os.path.join(model_dir, 'DF_all')
        write_weights(doc_count, doc_count_path)
        print "wrote DF counts for all features to:", doc_count_path

    if DFfeats is None:
      # Choose the first-stage features
      DFfeats = ngram_select(doc_count, args.max_order, args.df_tokens)

    if args.debug:
      feature_path = os.path.join(model_dir, 'DFfeats')
      write_features(DFfeats, feature_path)
      print 'wrote features to "%s"' % feature_path

    # Dispose of the first-pass tokenize output as it is no longer needed.
    if not args.debug:
      shutil.rmtree(tk_dir)

    # Second-pass tokenization to only obtain counts for the selected features.
    # As the first-pass set is typically much larger than the second pass, it often
    # works out to be faster to retokenize the raw documents rather than iterate
    # over the first-pass counts.
    DF_scanner = Scanner(DFfeats)
    df_dir = os.path.join(buckets_dir, 'tokenize-pass2')
    makedir(df_dir)
    b_dirs = build_index(items, DF_scanner, df_dir, args.buckets, args.jobs, args.chunksize)
    b_dirs = [[d] for d in b_dirs]

    # Build vectors of domain and language distributions for use in IG calculation
    domain_dist_vec = numpy.array([domain_dist[domain_index[d]]
                                   for d in sorted(domain_index, key=domain_index.get)], dtype=int)
    lang_dist_vec = numpy.array([lang_dist[lang_index[l]]
                                 for l in sorted(lang_index.keys(), key=lang_index.get)], dtype=int)

    # Compute IG
    ig_params = [
      ('lang', lang_dist_vec, '.lang', True),
    ]
    if not args.no_domain_ig:
      ig_params.append(('domain', domain_dist_vec, '.domain', False))

    ig_vals = {}
    for label, dist, suffix, binarize in ig_params:
      print "Computing information gain for {0}".format(label)
      ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
      if args.debug:
        weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' if binarize else ''))
        write_weights(ig, weights_path)
      ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

    # Select features according to the LD criteria
    features_per_lang = select_LD_features(ig_vals['lang'], ig_vals.get('domain'),
                                           args.feats_per_lang, ignore_domain=args.no_domain_ig)
    LDfeats = reduce(set.union, map(set, features_per_lang.values()))
    print 'selected %d features' % len(LDfeats)

    if args.debug:
      feature_path = os.path.join(model_dir, 'LDfeats')
      write_features(sorted(LDfeats), feature_path)
      print 'wrote LD features to "%s"' % feature_path

      with open(feature_path + '.perlang', 'w') as f:
        writer = csv.writer(f)
        for i in range(len(features_per_lang)):
          writer.writerow(map(repr, features_per_lang[i]))
      print 'wrote LD.perlang features to "%s"' % (feature_path + '.perlang')

  # Compile a scanner for the LDfeats
  tk_nextmove, tk_output = build_scanner(LDfeats)
  if args.debug:
    scanner_path = feature_path + '.scanner'
    with open(scanner_path, 'w') as f:
      cPickle.dump((tk_nextmove, tk_output, LDfeats), f)
    print "wrote scanner to {0}".format(scanner_path)

  # Assemble the NB model
  langs = sorted(lang_index, key=lang_index.get)

  nb_classes = langs
  nb_dir = os.path.join(buckets_dir, 'NBtrain')
  makedir(nb_dir)
  nb_pc, nb_ptc = learn_nb_params([(int(l), p) for _, l, p in items], len(langs),
                                  tk_nextmove, tk_output, nb_dir, args)

  # output the model
  output_path = os.path.join(model_dir, 'model')
  model = nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output
  string = base64.b64encode(bz2.compress(cPickle.dumps(model)))
  with open(output_path, 'w') as f:
    f.write(string)
  print "wrote model to %s (%d bytes)" % (output_path, len(string))

  # remove buckets if debug is off. We don't generate buckets if --ld_feats is supplied.
  if not args.debug and not args.ld_feats:
    shutil.rmtree(df_dir)
    if not args.temp:
      # Do not remove the buckets dir if temp was supplied as we don't know
      # if we created it.
      shutil.rmtree(buckets_dir)
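
# ----------------------------------------------------------------------------
# Usage sketch (illustrative note, not part of the pipeline code above). The
# corpus name "mycorpus" is hypothetical, and the per-domain, per-language
# directory layout (CORPUS_DIR/<domain>/<lang>/<document>) is assumed to match
# what CorpusIndexer expects; check index.py before relying on it.
#
#   python train.py -j 4 --debug mycorpus
#
# With the defaults, this writes the trained model - the tuple
# (nb_ptc, nb_pc, nb_classes, tk_nextmove, tk_output), cPickled, bz2-compressed
# and base64-encoded - to ./mycorpus.model/model. Because --debug is set, the
# intermediates (lang_index, domain_index, paths, DFfeats, IGweights*,
# LDfeats*, etc.) are also left in the model directory for inspection.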