#include #include #include #include #include #include "trigram.h" using namespace std; namespace biostrm = boost::iostreams; //TODO remove once everything uses tdlOptions tTrigramModel::tTrigramModel(string fbasename) { init(); //set defaults in case they aren't specified in the model tagtype = FULL; mapgen = true; string efname = fbasename + ".ex"; readEmissionScores(efname); string tfname = fbasename + ".tx"; readTransitionScores(tfname); } tTrigramModel::tTrigramModel(tdlOptions *tdlopts) { init(tdlopts); string fbasename = tdlopts->get("ut-model"); string efname = fbasename + ".ex"; readEmissionScores(efname); string tfname = fbasename + ".tx"; readTransitionScores(tfname); } tTrigramModel::tTrigramModel(vector &ifiles, tdlOptions *tdlopts, bool lambdaset, double l1, double l2, double l3, bool quiet) { init(tdlopts); if (lambdaset) {//use given lambda lambda1 = l1; lambda2 = l2; lambda3 = l3; } total = 0; for (vector::iterator fitr = ifiles.begin(); fitr != ifiles.end(); ++fitr) { ifstream ifile; if (!fitr->empty() && *fitr != "-") { ifile.open(fitr->c_str()); if (!ifile.is_open()) { cerr << "Couldn't read " << *fitr << ", skipping." << endl; continue; } } istream &inputstr = (ifile.is_open()?ifile:cin); if (inputstr.good()) { string iline; getline(inputstr, iline); string tminus2, tminus1; tminus1 = tminus2 = STAG(); while (!inputstr.eof()) { if (iline.empty()) {//end-of-sentence if (tminus1 != STAG()) { incTGCounts(tminus2, tminus1, ETAG()); incBGCounts(STAG(), STAG()); incBGCounts(tminus1, ETAG()); incUGCounts(ETAG()); incUGCounts(STAG()); } tminus1 = tminus2 = STAG(); } else { string word, tag; istringstream linestr(iline); getline(linestr, word, '\t'); getline(linestr, tag, '\t'); normalise(&word, &tag); incTWCounts(tag, word); incTGCounts(tminus2, tminus1, tag); incBGCounts(tminus1, tag); incUGCounts(tag); tminus2 = tminus1; tminus1 = tag; total++; if (!quiet) { if (total % 100000 == 0) cerr << total << " tokens read" << endl; else if (total % 10000 == 0) cerr << ". "; } } getline(inputstr, iline); } if (!quiet) cerr << endl; if (tminus1 != STAG()) { //add last end point incTGCounts(tminus2, tminus1, ETAG()); incBGCounts(STAG(), STAG()); incBGCounts(tminus1, ETAG()); incUGCounts(ETAG()); incUGCounts(STAG()); } if (ifile.is_open()) ifile.close(); } } } void tTrigramModel::init(tdlOptions *tdlopts) { _ut_debug = false; if (tdlopts && tdlopts->lookup("ut-debug") != NULL) { if (tdlopts->get("ut-debug") == "true") { _ut_debug = true; } } //set defaults in case they aren't specified in the model tagtype = FULL; if (tdlopts && tdlopts->lookup("ut-tagtype") != NULL) { string type = tdlopts->get("ut-tagtype"); for (map::const_iterator it = TagTypeName.begin(); it != TagTypeName.end(); ++it) { if (type.compare(it->second) == 0) { tagtype = it->first; break; } } } mapgen = true; if (tdlopts && tdlopts->lookup("ut-mapgen") != NULL) { string mapgenopt = tdlopts->get("ut-mapgen"); if (mapgenopt == "true") mapgen = true; else mapgen = false; } _caseclass_sep = "▲"; if (tdlopts && tdlopts->lookup("ut-caseclass_separator") != NULL) _caseclass_sep = tdlopts->get("ut-caseclass_separator"); string basedir = ""; if (tdlopts && tdlopts->lookup("ut-basedir") != NULL) basedir = tdlopts->get("ut-basedir"); if (tdlopts && tdlopts->lookup("suffixes") != NULL) { string affixfile = string(basedir+"/"+tdlopts->get("suffixes")); read_affix_file(suffixes, affixfile); } else { suffixes["w_period_plr"] = vector(1, string(".")); suffixes["w_qmark_plr"] = vector(1, string("?")); suffixes["w_qqmark_plr"] = vector(1, string("?")); suffixes["w_qmark-bang_plr"] = vector(1, string("!")); suffixes["w_comma_plr"] = vector(1, string(",")); suffixes["w_bang_plr"] = vector(1, string("!")); suffixes["w_semicol_plr"] = vector(1, string(";")); suffixes["w_double_semicol_plr"] = vector(1, string(";;")); suffixes["w_rparen_plr"] = vector(1, string(")")); suffixes["w_comma-rp_plr"] = vector(1, string(",)")); suffixes["w_rbrack_plr"] = vector(1, string("]")); suffixes["w_rbrack_plr"].push_back(string("}")); suffixes["w_rbrack_plr"].push_back(string("|")); suffixes["w_dqright_plr"] = vector(1, string("”")); suffixes["w_dqright_plr"].push_back(string("\"")); suffixes["w_dqright_plr"].push_back(string("''")); suffixes["w_sqright_plr"] = vector(1, string("’")); suffixes["w_sqright_plr"].push_back(string("'")); suffixes["w_hyphen_plr"] = vector(1, string("-")); suffixes["w_threedot_plr"] = vector(1, string("...")); suffixes["w_asterisk_plr"] = vector(1, string("*")); suffixes["w_comma-nf_plr"] = vector(1, string(",")); suffixes["w_italright_plr"] = vector(1, string("i¦")); suffixes["w_drop-iright_plr"] = vector(1, string("i¦")); } if (tdlopts && tdlopts->lookup("prefixes") != NULL) { string affixfile = string(basedir+"/"+tdlopts->get("prefixes")); read_affix_file(prefixes, affixfile); } else { prefixes["w_lparen_plr"] = vector(1, string("(")); prefixes["w_lbrack_plr"] = vector(1, string("[")); prefixes["w_lbrack_plr"].push_back(string("{")); prefixes["w_lbrack_plr"].push_back(string("|")); prefixes["w_dqleft_plr"] = vector(1, string("“")); prefixes["w_dqleft_plr"].push_back(string("”")); prefixes["w_dqleft_plr"].push_back(string("\"")); prefixes["w_dqleft_plr"].push_back(string("``")); prefixes["w_sqleft_plr"] = vector(1, string("‘")); prefixes["w_sqleft_plr"].push_back(string("'")); prefixes["w_sqleft_plr"].push_back(string("`")); prefixes["w_asterisk_pre_plr"] = vector(1, string("*")); prefixes["w_italleft_plr"] = vector(1, string("¦i")); prefixes["w_drop-ileft_plr"] = vector(1, string("¦i")); } if (tdlopts && tdlopts->lookup("generics_map") != NULL) { string genfile = string(basedir+"/"+tdlopts->get("generics_map")); read_mapgen_file(genfile); } else { generics_map["aj_-_i-cmp-unk_le"] = "aj_pp_i-cmp_le"; generics_map["aj_-_i-crd-gen_le"] = "aj_-_i-crd-two_le"; generics_map["aj_-_i-crd-unk_le"] = "aj_-_i-crd-two_le"; generics_map["aj_-_i-frct-gen_le"] = "aj_-_i-frct_le"; generics_map["aj_-_i-ord-gen_le"] = "aj_-_i-ord-two_le"; generics_map["aj_-_i-sup-unk_le"] = "aj_-_i-sup_le"; generics_map["aj_-_i-unk_le"] = "aj_-_i_le"; generics_map["aj_np_i-crd-gen_le"] = "aj_np_i-crd-nsp_le"; generics_map["av_-_dc-like-unk_le"] = "av_-_dc-like-pr_le"; generics_map["av_-_i-unk_le"] = "av_-_i-vp_le"; generics_map["n_-_c-pl-gen_le"] = "n_-_mc_le:n_pl_olr"; generics_map["n_-_c-pl-unk_le"] = "n_-_mc_le:n_pl_olr"; generics_map["n_-_day-crd-gen_le"] = "n_-_c-day_le"; generics_map["n_-_mc-ns-g_le"] = "n_-_mc-ns_le"; generics_map["n_-_mc-unk_le"] = "n_-_mc_le"; generics_map["n_-_meas-n-gen_le"] = "n_-_c-meas_le"; generics_map["n_-_pn-dom-e-gen_le"] = "n_-_pn-dom-euro_le"; generics_map["n_-_pn-dom-gen_le"] = "n_-_pn-dom-card_le"; generics_map["n_-_pn-dom-o-gen_le"] = "n_-_pn-dom-ord_le"; generics_map["n_-_pn-gen_le"] = "n_-_pn_le"; generics_map["n_-_pn-pl-unk_le"] = "n_-_pn-pl_le"; generics_map["n_-_pn-unk_le"] = "n_-_pn_le"; generics_map["n_np_pn-hour-gen_le"] = "n_-_pn-hour_le"; generics_map["v_-_pas-unk_le"] = "v_-_psv_le"; generics_map["v_np*_bse-unk_le"] = "v_np*_le:v_n3s-bse_ilr"; generics_map["v_np*_pa-unk_le"] = "v_np*_le:v_pst_olr"; generics_map["v_np*_pr-3s-unk_le"] = "v_np*_le:v_3s-fin_olr"; generics_map["v_np*_pr-n3s-unk_le"] = "v_np*_le:v_n3s-bse_ilr"; generics_map["v_np*_prp-unk_le"] = "v_np*_le:v_prp_olr"; generics_map["v_np*_psp-unk_le"] = "v_np*_le:v_psp_olr"; generics_map["v_np*_unk_le"] = "v_np*_le"; } if (tdlopts && tdlopts->lookup("ut-whitelist") != NULL) { string wlfile = string(basedir+"/"+tdlopts->get("ut-whitelist")); read_whitelist_file(wlfile); } } tTrigramModel::~tTrigramModel() { clearCounts(); } double tTrigramModel::getTransProb(string a, string b, string c) { string key = a + "␣" + b + "␣" + c; if (trigrams_s.count(key) > 0) { return trigrams_s[key]; } else { key = b + "␣" + c; if (bigrams_s.count(key) > 0) { return bigrams_s[key]; } else if (unigrams_s.count(c) > 0) { return unigrams_s[c]; } else { return MINLOGPROB; } } } double tTrigramModel::getEmit(string t, string w) { string key = t + "␣" + w; if (emissions_s.count(key) > 0) return emissions_s[key]; else return MINLOGPROB; } void tTrigramModel::trigramDeletedInterpolation() { lambda1 = lambda2 = lambda3 = 0; for (TGMap::iterator tgitr = tgcounts.begin(); tgitr != tgcounts.end(); ++tgitr) { string t1 = tgitr->first; for (BGMap::iterator bgitr = tgitr->second ->begin(); bgitr != tgitr->second->end(); ++bgitr) { string t2 = bgitr->first; for (CMap::iterator ugitr = bgitr->second->begin(); ugitr != bgitr->second->end(); ++ugitr) { string t3 = ugitr->first; double tg = (static_cast(ugitr->second) - 1) / ((*bgcounts[t1])[t2] - 1); double bg = (static_cast((*bgcounts[t2])[t3]) - 1)/ (ugcounts[t2] - 1); double ug = (static_cast(ugcounts[t3]) - 1) / (total - 1); if (tg >= bg && tg >= ug) { lambda3 += ugitr->second; } else { if (bg >= tg && bg >= ug) { lambda2 += ugitr->second; } else { lambda1 += ugitr->second; } } } } } int totallambda = lambda1 + lambda2 + lambda3; lambda1 = lambda1/totallambda; lambda2 = lambda2/totallambda; lambda3 = lambda3/totallambda; } void tTrigramModel::calculateEmissionScores() { for (map::iterator citr = twcounts.begin(); citr != twcounts.end(); ++citr) { string tag = citr->first; int tcount = ugcounts[tag]; for (CMap::iterator twitr = citr->second->begin(); twitr != citr->second->end(); ++twitr) { string key = tag + "␣" + twitr->first; double logprob = log(static_cast(twitr->second)/tcount); //cerr << key << ": log(" << twitr->second << "/" << tcount // << " = " << logprob << endl; emissions_s.insert(SMapVal(key, logprob)); } } //Stored emission logprobs } void tTrigramModel::calculateTransitionScores() { for (TGMap::iterator tgitr = tgcounts.begin(); tgitr != tgcounts.end(); ++tgitr) { string t1 = tgitr->first; for (BGMap::iterator bgitr = tgitr->second ->begin(); bgitr != tgitr->second->end(); ++bgitr) { string t2 = bgitr->first; for (CMap::iterator ugitr = bgitr->second->begin(); ugitr != bgitr->second->end(); ++ugitr) { string t3 = ugitr->first; string key = t1 + "␣" + t2 + "␣" + t3; //cerr << key << endl; //cerr << "(" << lambda1 << " * (" << ugcounts[t3] << "/" << total << ")) + // << (" << lambda2 << " * (" << (*bgcounts[t2])[t3] << "/" << ugcounts[t2] // << ") + (" << lambda3 << " * (" << ugitr->second << "/" // << (*bgcounts[t1])[t2] << ")) = "; double prob = (lambda1 * (static_cast(ugcounts[t3])/total)) + (lambda2 * (static_cast((*bgcounts[t2])[t3])/ugcounts[t2])) + (lambda3 * (static_cast(ugitr->second)/(*bgcounts[t1])[t2])); //cerr << prob << endl; if (prob > 0) //don't add zero prob events trigrams_s.insert(SMapVal(key, log(prob))); } } } //Added trigram scores: log(λ1*P(t3) + λ2*P(t2,t3) + λ3*P(t1,t2,t3)) for (BGMap::iterator bgitr = bgcounts.begin(); bgitr != bgcounts.end(); ++bgitr) { string t2 = bgitr->first; for (CMap::iterator ugitr = bgitr->second->begin(); ugitr != bgitr->second->end(); ++ugitr) { string t3 = ugitr->first; string key = t2 + "␣" + t3; double prob = (lambda1 * (static_cast(ugcounts[t3])/total)) + (lambda2 * (static_cast(ugitr->second)/ugcounts[t2])); if (prob > 0) //don't add zero prob events bigrams_s.insert(SMapVal(key, log(prob))); } } //Added bigram scores: log(λ1*P(t3) + λ2*P(t2,t3)) for (CMap::iterator ugitr = ugcounts.begin(); ugitr != ugcounts.end(); ++ugitr) { double prob = lambda1*(static_cast(ugitr->second)/total); if (prob > 0) //don't add zero prob events unigrams_s.insert(SMapVal(ugitr->first, log(prob))); } //Added unigram scores: log(λ1*P(t3)) } void tTrigramModel::clearCounts() { for (map::iterator citr = twcounts.begin(); citr != twcounts.end(); ++citr) delete citr->second; for (TGMap::iterator tgitr = tgcounts.begin(); tgitr != tgcounts.end(); ++tgitr) { for (BGMap::iterator bgitr = tgitr->second ->begin(); bgitr != tgitr->second->end(); ++bgitr) delete bgitr->second; delete tgitr->second; } tgcounts.clear(); for (BGMap::iterator bgitr = bgcounts.begin(); bgitr != bgcounts.end(); ++bgitr) delete bgitr->second; bgcounts.clear(); ugcounts.clear(); } void tTrigramModel::incTWCounts(string t, string w, int c) { if (twcounts.count(t) == 0) twcounts.insert(BGMapVal(t, new CMap())); if (twcounts[t]->count(w) == 0) twcounts[t]->insert(CMapVal(w, 0)); (*twcounts[t])[w] += c; } void tTrigramModel::incTGCounts(string t1, string t2, string t3, int c) { if (tgcounts.count(t1) == 0) tgcounts.insert(TGMapVal(t1, new map())); if (tgcounts[t1]->count(t2) == 0) tgcounts[t1]->insert(BGMapVal(t2, new CMap())); if ((*tgcounts[t1])[t2]->count(t3) == 0) (*tgcounts[t1])[t2]->insert(CMapVal(t3, 0)); (*(*tgcounts[t1])[t2])[t3] += c; } void tTrigramModel::incBGCounts(string t1, string t2, int c) { if (bgcounts.count(t1) == 0) bgcounts.insert(BGMapVal(t1, new CMap())); if (bgcounts[t1]->count(t2) == 0) bgcounts[t1]->insert(CMapVal(t2, 0)); (*bgcounts[t1])[t2] += c; } void tTrigramModel::incUGCounts(string t1, int c) { if (ugcounts.count(t1) == 0) ugcounts.insert(CMapVal(t1, 0)); ugcounts[t1] += c; } void tTrigramModel::writeCompiled(string mname, bool quiet, string comment) { string efname = mname + ".ex.gz"; string tfname = mname + ".tx.gz"; ofstream ef(efname.c_str(), ios_base::out| ios_base::binary); if (ef.is_open()) { biostrm::filtering_stream zf; zf.push(biostrm::gzip_compressor()); zf.push(ef); write_header(zf, comment); int count = 0; for (SMap::iterator citr = emissions_s.begin(); citr != emissions_s.end(); ++citr) { string key = citr->first; string sep("␣"); size_t index = 0; size_t sep_idx = key.find(sep, index); while (sep_idx != string::npos) { key.replace(sep_idx, sep.length(), 1, '\t'); index = sep_idx + sep.length(); sep_idx = key.find(sep, index); } zf << key << "\t" << citr->second << endl; count++; } zf.reset(); if (!quiet) cerr << "Wrote " << count << " lines to " << efname << endl; } else { cerr << "ERROR: Couldn't open " << efname << " for writing." << endl; exit(1); } ofstream tf(tfname.c_str(), ios_base::out | ios_base::binary); if (tf.is_open()) { biostrm::filtering_stream zf; zf.push(biostrm::gzip_compressor()); zf.push(tf); write_header(zf, comment); int count = 0; for (SMap::iterator citr = trigrams_s.begin(); citr != trigrams_s.end(); ++citr) { string key = citr->first; string sep("␣"); size_t index = 0; size_t sep_idx = key.find(sep, index); while (sep_idx != string::npos) { key.replace(sep_idx, sep.length(), 1, '\t'); index = sep_idx; sep_idx = key.find(sep, index); } zf << key << "\t" << citr->second << endl; count++; } for (SMap::iterator citr = bigrams_s.begin(); citr != bigrams_s.end(); ++citr) { string key = citr->first; string sep("␣"); size_t index = 0; size_t sep_idx = key.find(sep, index); while (sep_idx != string::npos) { key.replace(sep_idx, sep.length(), 1, '\t'); index = sep_idx; sep_idx = key.find(sep, index); } zf << key << "\t" << citr->second << endl; count++; } for (SMap::iterator citr = unigrams_s.begin(); citr != unigrams_s.end(); ++citr) { string key = citr->first; zf << key << "\t" << citr->second << endl; count++; } zf.reset(); if (!quiet) cerr << "Wrote " << count << " lines to " << tfname << endl; } else { cerr << "ERROR: Couldn't open " << tfname << " for writing." << endl; exit(1); } } void tTrigramModel::readEmissionScores(string efname) { biostrm::filtering_stream in; ifstream ef(string(efname+".gz").c_str(), ios_base::in | ios_base::binary); if (ef.is_open()) { in.push(biostrm::gzip_decompressor()); in.push(ef); } else { ef.open(efname.c_str()); if (ef.is_open()) in.push(ef); else { cerr << "Couldn't open " << efname << ". Exiting." << endl; exit(1); } } string eline; getline(in, eline); while (!in.eof()) { if (eline.empty()) { getline(in, eline); continue; } if (eline.compare(0, 3, "\%\%\%") == 0) { //header istringstream linestr(eline.substr(3)); string opt; linestr >> opt; if (opt.compare("TAGTYPE:") == 0) { string type; linestr >> type; for (map::const_iterator it = TagTypeName.begin(); it != TagTypeName.end(); ++it) { if (type.compare(it->second) == 0) { tagtype = it->first; break; } } } if (opt.compare("MAPPED:") == 0) { string mopt; linestr >> mopt; if (mopt.compare("YES") == 0) mapgen = true; else if (mopt.compare("NO") == 0) mapgen = false; } if (opt.compare("CASECLASS SEP:") == 0) { linestr >> _caseclass_sep; } getline(in, eline); continue; } string tag, word; double score; istringstream linestr(eline); try { getline(linestr, tag, '\t'); getline(linestr, word, '\t'); //words can have spaces linestr >> score; } catch (...) { cerr << "Error parsing " << efname << " at\n\t" << eline << endl; exit(1); } string key = tag + "␣" + word; emissions_s.insert(SMapVal(key, score)); getline(in, eline); } ef.close(); in.reset(); } void tTrigramModel::readTransitionScores(string tfname) { biostrm::filtering_stream in; ifstream tf(string(tfname+".gz").c_str(), ios_base::in | ios_base::binary); if (tf.is_open()) { in.push(biostrm::gzip_decompressor()); in.push(tf); } else { tf.open(tfname.c_str()); if (tf.is_open()) in.push(tf); else { cerr << "Couldn't open " << tfname << ". Exiting." << endl; exit(1); } } string tline, col1, col2, col3, col4; double score; getline(in, tline); while(!in.eof()) { if (tline.compare(0, 3, "\%\%\%") == 0) { //header getline(in,tline); continue; } istringstream linestr(tline); getline(linestr, col1, '\t'); getline(linestr, col2, '\t'); if (getline(linestr, col3, '\t')) {//at least 3 col if (getline(linestr, col4, '\t')) {//4 col istringstream dstr(col4); dstr >> score; string key = col1 + "␣" + col2 + "␣" + col3; trigrams_s.insert(SMapVal(key, score)); } else {//3 col istringstream dstr(col3); dstr >> score; string key = col1 + "␣" + col2; bigrams_s.insert(SMapVal(key, score)); } } else {//2 col istringstream dstr(col2); dstr >> score; unigrams_s.insert(SMapVal(col1, score)); } getline(in,tline); } tf.close(); in.reset(); } void tTrigramModel::normalise(string *surface, string *tag) { if (mapgen) { size_t colon = tag->find_first_of(':'); if (colon == string::npos) colon = tag->length(); //letype only if (generics_map.count(tag->substr(0, colon)) > 0) { tag->replace(0, colon, generics_map[tag->substr(0, colon)]); } } if (tagtype == NOAFFIX || tagtype == LETYPE || tagtype == MSC) { //strip plr rules and affixes from surface size_t colon = tag->find_last_of(':'); size_t index = tag->length()-1; while (colon != string::npos) { string rule = tag->substr(colon+1, index - colon); // if (index > 2 && tag->compare(index - 2, 3, "plr") == 0) if (prefixes.count(rule) > 0) { for (vector::iterator a = prefixes[rule].begin(); a != prefixes[rule].end(); ++a) { if (surface->compare(0, a->length(), *a) == 0) { surface->erase(0, a->length()); break; } } tag->erase(colon, index - colon + 1); } else { if (suffixes.count(rule) > 0) { for (vector::iterator a = suffixes[rule].begin(); a != suffixes[rule].end(); ++a) { //now we've added case class after the surface, erase from before //the (optional) case class //size_t send = surface->find_last_of(_caseclass_sep); size_t send = surface->find(_caseclass_sep); if (send != string::npos && send + _caseclass_sep.length() < surface->length() && ((*surface)[send + _caseclass_sep.length()] == 'c' || (*surface)[send + _caseclass_sep.length()] == 'n')) { //send is start of case class } else { send = surface->length(); } if (surface->compare(send - a->length(), a->length(), *a) == 0) { surface->erase(send - a->length(), a->length()); break; } } tag->erase(colon, index - colon + 1); } else { if (tagtype == LETYPE || tagtype == MSC) { //strip all rules tag->erase(colon, index - colon + 1); } } } index = colon - 1; colon = tag->find_last_of(':', index); } //seen all rules } if (tagtype == MSC || tagtype == MSCAFFIX) { size_t underscore = tag->find('_'); size_t colon = tag->find(':'); if (colon == string::npos) tag->erase(underscore, tag->length()-underscore); else tag->erase(underscore, colon - underscore); } } void tTrigramModel::write_header(biostrm::filtering_stream &of, string comment) { time_t t = time(0); struct tm *now = localtime(&t); of << "\%\%\% Model created " << now->tm_mday << "/" << (now->tm_mon + 1) << "/" << (now->tm_year + 1900) << "\n"; of << "\%\%\% Lambda values were " << lambda1 << ", " << lambda2 << ", " << lambda3 << "\n"; if (!comment.empty()) { size_t newline = comment.find('\n'); while (newline != string::npos) { comment.replace(newline, 1, "\n\%\%\%", 4); newline = comment.find('\n', newline+1); } of << "\%\%\% Comment: " << comment << "\n"; } of << "\%\%\% TAGTYPE: " << TagTypeName.at(tagtype) << "\n"; of << "\%\%\% MAPPED: " << (mapgen?"YES":"NO") << "\n"; of << "\%\%\% CASECLASS SEP: " << _caseclass_sep << "\n"; of << "\n\n"; } string tTrigramModel::lowercase(string orig) { for(int i = 0; orig[i]; i++){ orig[i] = tolower(orig[i]); } return orig; } void tTrigramModel::read_affix_file(affixMap &amap, string filename) { ifstream af(filename.c_str()); if (af.is_open()) { string aline, rule, affix; getline(af, aline); while (!af.eof()) { if (aline.empty() || aline.at(0) == ';') { getline(af, aline); continue; } istringstream linestr(aline); getline(linestr, rule, '\t'); getline(linestr, affix, '\t'); if (amap.count(rule) == 0) amap[rule] = vector(); amap[rule].push_back(affix); getline(af, aline); } af.close(); } else { cerr << "Couldn't open affix file " << filename << ". Continuing without affix map." << endl; } } void tTrigramModel::read_mapgen_file(string filename) { ifstream mf(filename.c_str()); if (mf.is_open()) { string mline, gen, native; getline(mf, mline); while (!mf.eof()) { if (mline.empty() || mline.at(0) == ';') { getline(mf, mline); continue; } istringstream linestr(mline); getline(linestr, gen, '\t'); getline(linestr, native, '\t'); generics_map[gen] = native; getline(mf, mline); } mf.close(); } else { cerr << "Couldn't open generics file " << filename << ". Continuing without generics map." << endl; } } void tTrigramModel::read_whitelist_file(string filename) { ifstream wf(filename.c_str()); if (wf.is_open()) { string wline, let; getline(wf, wline); while (!wf.eof()) { if (wline.empty() || wline.at(0) == ';') { getline(wf, wline); continue; } istringstream linestr(wline); linestr >> let; whitelist[let] = 1; getline(wf, wline); } wf.close(); } else { cerr << "Couldn't open whitelist file " << filename << ". Continuing without whitelist." << endl; } } bool tTrigramModel::on_whitelist(string s) { if (whitelist.count(s) > 0) return true; else return false; } double logsumexp (vector els) { double sum = 0; for (vector::iterator it = els.begin(); it != els.end(); ++it) sum += exp(*it); return log(sum); } double sumexp (vector els) { double sum = 0; for (vector::iterator it = els.begin(); it != els.end(); ++it) sum += exp(*it); return sum; }