// Command-line tagger: reads a grammar file and a lexical profile, builds one
// lattice of tag states per profile item, scores it with a trigram model
// (viterbi), extracts the best path (best_path) and prints it (print_conll).
//
// NOTE(review): this text appears to have lost its line breaks and whatever
// stood between '<' and '>' (header names after the bare #include tokens,
// template arguments, and the span between the end of best_path() and the
// start of print_conll()). Restore those from the original revision before
// building; the notes below describe only what is still visible here.
//
// main()
//   Loops over p.getResult() until result.first.first is negative.
//   result.first.first is the item id ("context"), result.first.second the
//   lexical tree id ("event"). Whenever the item id changes, the lattice
//   collected so far is decoded, printed, and all tState/lItem objects are
//   deleted. Every derivation must yield exactly one leaf, otherwise the
//   program exits with status 1. States are de-duplicated through `seen`,
//   keyed by "tag:end:start"; a repeated key attaches the new lItem to the
//   existing state instead of creating a second one.
//   The key is built by streaming `tag` into an empty ostringstream: a stream
//   constructed from an initial string writes from position 0 and would
//   overwrite the beginning of the tag, letting different tags share a key.
//
// parse_options()
//   Boost.Program_options front end. Positionals are grammar-file and
//   profile. --help prints usage and exits 0; missing positionals exit 1.
//   With --config, ut-basedir is set to the config file's parent directory
//   (or the current directory) and a non-complete ut-model path is resolved
//   against it. --model overrides ut-model. Exits 1 if no ut-model is set.
//   The four bool outputs are set from the presence of their switches.
//
// collectleaves()
//   Callback handed to the DerivReader. Appends `s` to `leaves`, builds its
//   tag from the surface of the last ancestor followed by ":"-separated
//   surfaces of the preceding ancestors for as long as g.is_lexrule() holds,
//   records probability "1.0", copies the leaf's start/end into
//   startchar/endchar, then replaces start/end with the last ancestor's.
//
// viterbi()
//   Adds a start state to slot 0 and an extra final slot holding an end
//   state, then for every state in slots 1..T records, per predecessor state,
//   the best-scoring (delta + transition + emission) history via add_prev().
//   NOTE(review): as visible here `ends` is declared without '&', i.e. it is
//   a copy of *endsptr, so the sentinel states live only in the copy and are
//   never deleted — confirm against the original before changing, because
//   best_path() follows back-pointers to the start state.
//
// best_path()
//   Scans the last slot of `ends` for the prev entry with the largest delta,
//   follows backptr()/trace() back to a NULL back-pointer while appending to
//   *path, then reverses the path. `newidx` is initialised so that the final
//   `idx = newidx` never reads an indeterminate value. The statements that
//   set and return `score` are not visible in this text.
//
// print_conll()
//   For each path state whose tag is neither STAG() nor ETAG(): prints the
//   character offsets when `offsets` is set, then the surface form (or the
//   word if the surface is empty) and the tag, tab-separated; a blank line
//   terminates the item.
#include #include #include #include #include #include #include #include "unicode.h" #include "grammar.h" #include "profile.h" #include "derivReader.h" #include "derivReader.cpp" #include "lattice.h" #include "trigram.h" #include "tdl_options.h" using namespace std; namespace fs = boost::filesystem; typedef vector Slot; typedef vector Path; typedef Slot::iterator SlotItr; typedef Path::iterator PathItr; typedef map SIMap; typedef SIMap::iterator SIMapItr; typedef map ISMap; typedef ISMap::iterator ISMapItr; void collectleaves(delphin::Grammar &g, delphin::Node s, vector &ancestors, vector &leaves); void parse_options(int argc, char **argv, tdlOptions *tdl_opts, string *gfname, string *pname, bool *num, bool *offsets, bool *verbose, bool *goldonly, tIid *itemnum); extern string strip_quotes(string orig); void viterbi(vector *endsptr, tTrigramModel &model, bool verbose); double best_path(vector &ends, Path *path); void print_conll(Path &path, bool offsets); int main (int argc, char **argv) { // setting option variables string gfname, pname; tdlOptions *tdl_opts = new tdlOptions(); tIid itemnum; bool verbose, goldonly, num, offsets; try { parse_options(argc, argv, tdl_opts, &gfname, &pname, &num, &offsets, &verbose, &goldonly, &itemnum); } catch ( const boost::program_options::error& e ) { cerr << "Invalid command: " << e.what() << "\nExiting." 
<< endl; exit(1); } tTrigramModel model(tdl_opts); cerr << "Read model " << tdl_opts->get("ut-model") << endl; // UTF-8 encoder initialize_encoding_converter("utf-8"); delphin::Grammar g(gfname); delphin::Profile p(pname); vector leaves; //used in collectleaves() delphin::DerivReader > reader(g, leaves, NULL, NULL, &collectleaves); pair,string> result = p.getResult(); tIid context = -1; // context is item id int event = -1; // event is lexical tree id vectorends; //list of nodes, indexed by end position vectorstrees; map seen; tState *node; while (result.first.first >= 0) {//new lexical tree if (result.first.first != context) {//new item/context if (ends.size() > 0) { //nodes in lattice viterbi(&ends, model, verbose); Path path; double score = best_path(ends, &path); if (num) cout << "#" << context << endl; print_conll(path, offsets); path.clear(); for (vector::iterator eit = ends.begin(); eit != ends.end(); ++eit) { for (SlotItr nit = eit->begin(); nit != eit->end(); ++nit) delete *nit; } ends.clear(); seen.clear(); for (vector::iterator iit = strees.begin(); iit != strees.end(); ++iit) delete *iit; strees.clear(); } else { //no nodes in lattice if (context > -1 && !(itemnum >= 0 && context != itemnum) && !(goldonly && !p.numGold(context))) { //why? cerr << "Item " << context << " skipped?." << endl; } } } context = result.first.first; event = result.first.second; //ignore items, depending on commandline options if ((itemnum >= 0 && context != itemnum) || //single item (goldonly && !p.numGold(context))) { //goldonly result = p.getResult(); continue; } //do stuff with result leaves.clear(); reader.readDeriv(result.second); if (leaves.size() != 1) { cerr << "Unexpected input in profile: " << context << ":" << event << " has " << leaves.size() << " leaves. " << "Is it really a lexical profile?" 
<< endl; exit(1); } string orig = strip_quotes(leaves[0].surface); string word = orig; if (!leaves[0].caseclass.empty()) { word += model.caseclass_sep(); word += leaves[0].caseclass; } string tag = leaves[0].tags[0]; model.normalise(&word, &tag); int start = leaves[0].start; int end = leaves[0].end; int startchar = leaves[0].startchar; int endchar = leaves[0].endchar; lItem *utag = new lItem(start, end, orig, word, startchar, endchar); strees.push_back(utag); ostringstream key; key << tag << ":" << end << ":" << start; if (seen.count(key.str()) == 0) { if (ends.size() < leaves[0].end+1) ends.resize(leaves[0].end+1); double emit = model.getEmit(tag, word); node = new tState(utag, tag, emit, tag); ends[(node->itemptr())[0]->end()].push_back(node); seen[key.str()] = node; } else { seen[key.str()]->itemptr(utag); } result = p.getResult(); } //finish last context if (ends.size() > 0) { viterbi(&ends, model, verbose); Path path; double score = best_path(ends, &path); if (num) cout << "#" << context << endl; print_conll(path, offsets); for (vector::iterator eit = ends.begin(); eit != ends.end(); ++eit) { for (SlotItr nit = eit->begin(); nit != eit->end(); ++nit) delete *nit; } for (vector::iterator iit = strees.begin(); iit != strees.end(); ++iit) delete *iit; } finalize_encoding_converter(); return 0; } void parse_options(int argc, char **argv, tdlOptions *tdl_opts, string *gfname, string *pname, bool *num, bool *offsets, bool *verbose, bool *goldonly, tIid *itemnum) { namespace po = boost::program_options; po::options_description general("Options"); general.add_options() ("help,h", "This usage information.") ("model,m", po::value(), "Trigram tagging model (must be set in the config file or on command line") ("config,c", po::value(), "Configuration file") ("goldonly,g", "Only process items with 'gold' trees") ("single,s", po::value(itemnum)->default_value(-1), "Select a specific item, default (-1): all.") ("number,n", "Print item numbers") ("offsets,o", "Print character 
offsets of tokens") ("verbose,v", "Print lattice details."); po::options_description hidden("Hidden options"); hidden.add_options() ("grammar-file", po::value(gfname), "grammar .tdl file") ("profile", po::value(pname), "lexical profile") ; po::options_description cmd_line ("Command line options"); cmd_line.add(general).add(hidden); po::positional_options_description p; p.add("grammar-file",1).add("profile",1); po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmd_line).positional(p).run(), vm); notify(vm); if (vm.count("help")) { cout << "Usage: " << argv[0] << " [options] " << "grammar-file lexical-profile" << endl; cout << general << endl; exit(0); } if (!vm.count("grammar-file") || !vm.count("profile")) { cerr << "Insufficient arguments given." << endl; cerr << "Usage: " << argv[0] << " [options] " << "grammar-file lexical-profile" << endl; cerr << general << endl; exit(1); } if (vm.count("config")) { try { string cfname = vm["config"].as(); fs::path cpath(cfname); if (cpath.has_parent_path()) tdl_opts->set("ut-basedir", cpath.parent_path().string()); else tdl_opts->set("ut-basedir", fs::current_path().string()); tdl_opts->read(cfname); if (tdl_opts->lookup("ut-model") != NULL) { //make model findable from current dir fs::path mpath(tdl_opts->get("ut-model")); if (!mpath.is_complete()) { tdl_opts->set("ut-model", fs::path(tdl_opts->get("ut-basedir") / mpath).string()); } } } catch (exception &e) { cerr << "Error reading config file: " << e.what() << endl; exit(1); } } if (vm.count("model")) { tdl_opts->set("ut-model", vm["model"].as()); } if (tdl_opts->lookup("ut-model") == NULL) { cerr << "Error: a model must be given, either as the ut-model option " << "in the config file, or on the command line." 
<< endl; exit(1); } if (vm.count("goldonly")) *goldonly = true; else *goldonly = false; if (vm.count("verbose")) *verbose = true; else *verbose = false; if (vm.count("number")) *num = true; else *num = false; if (vm.count("offsets")) *offsets = true; else *offsets = false; } //should only be a single leaf in a lexical profile, but we'll use a vector, //just in case void collectleaves(delphin::Grammar &g, delphin::Node s, vector &ancestors, vector &leaves) { leaves.push_back(s); string tag(ancestors.back().surface); for (int x=ancestors.size()-2; x >= 0 && g.is_lexrule(ancestors[x].surface); x--) { string infl = ancestors[x].surface; tag+=":"; tag+=infl; } leaves.back().tags.push_back(tag); leaves.back().probs.push_back("1.0"); leaves.back().startchar = leaves.back().start; leaves.back().endchar = leaves.back().end; leaves.back().start = ancestors.back().start; leaves.back().end = ancestors.back().end; } void viterbi(vector *endsptr, tTrigramModel &model, bool verbose) { vector ends = *endsptr; if (ends.size() > 0) { //start sentinal lItem *startitem = new lItem(0, 0, "", ""); tState *startnode = new tState(startitem, STAG(), 0, STAG()); startnode->add_prev(NULL, 0, -1); ends[0].push_back(startnode); // if we want to consider probability of being the last tag lItem *enditem = new lItem(ends.size()-1, ends.size(), "", ""); tState *endnode = new tState(enditem, ETAG(), 0, ETAG()); ends.push_back(Slot()); ends.back().push_back(endnode); //Trigram: hij -> j = tag at t, i = tag at t-1, h = tag at t-2 //for each time slot t for (int t = 1; t < ends.size(); ++t) { if (verbose) cerr << "t=" << t << endl; //for each node that ends at time slot t: j for (SlotItr it = ends[t].begin(); it != ends[t].end(); ++it) { tState *node = *it; if (verbose) cerr << " " << node->tag() << endl; //sanity check, should never fail? 
if ((node->itemptr())[0]->start() >= 0 && (node->itemptr())[0]->start() < ends.size()-1) { double maxProb = log(DBL_MIN); tState *bestNode = NULL; int best_idx = -1; //for each node that ends at j: i for (SlotItr sit = ends[(node->itemptr())[0]->start()].begin(); sit != ends[(node->itemptr())[0]->start()].end(); ++sit) { if (verbose) cerr << " " << (*sit)->tag() << endl; maxProb = log(DBL_MIN); best_idx = -1; int idx = 0; //index of delta in node i, to get to h //for each node that led to i: h for (vector::iterator pvit = (*sit)->prevs().begin(); pvit != (*sit)->prevs().end(); ++pvit) { string itag = (*sit)->tag(); if (pvit->backptr() != NULL) itag = pvit->backptr()->tag(); if (verbose) cerr << " " << itag << ": "; double transProb = model.getTransProb(itag, (*sit)->tag(), node->tag()); double logProb = pvit->delta() + transProb + node->emit(); if (verbose) cerr << pvit->delta() << "+" << transProb << "+" << node->emit() << "= " << logProb << endl; if (best_idx == -1 || maxProb < logProb) { maxProb = logProb; best_idx = idx; bestNode = *sit; } idx++; } //end of h nodes node->add_prev(bestNode, maxProb, best_idx); if (verbose && best_idx > -1) { string itag = (*sit)->tag(); if ((*sit)->prevs()[best_idx].backptr() != NULL) itag = (*sit)->prevs()[best_idx].backptr()->tag(); cerr << " " << itag << " " << (*sit)->tag() << " " << maxProb << endl; } } //end of i nodes } //end of sanity check } //end of j nodes at t } //end of slots, t==T delete startitem; delete enditem; } //non-empty ends processed } double best_path(vector &ends, Path *path) { double score = log(DBL_MIN); if (ends.size() > 0) { tState *best = NULL; double bestdelta = log(DBL_MIN); int best_uidx = -1; //find best node at end of path for (SlotItr it = ends[ends.size()-1].begin(); it != ends[ends.size()-1].end(); ++it) { int idx = 0; for (vector::iterator pvit = (*it)->prevs().begin(); pvit != (*it)->prevs().end(); ++pvit) { if (best == NULL || bestdelta < pvit->delta()) { best = *it; bestdelta = 
pvit->delta(); best_uidx = idx; } idx++; } } //found end of best path if (best != NULL) { int idx = best_uidx; //follow path, recording nodes for (tState *node = best; node != NULL;) { path->push_back(node); int newidx = -1; if (node->prevs()[idx].backptr() != NULL) newidx = node->prevs()[idx].trace(); node = node->prevs()[idx].backptr(); idx = newidx; } //reverse path for (int i = 0, j=path->size()-1; itag().compare(STAG()) != 0 && (*pit)->tag().compare(ETAG()) != 0) { if (offsets) { cout << ((*pit)->itemptr())[0]->startchar() << "\t" << ((*pit)->itemptr())[0]->endchar() << "\t"; } cout << ((!((*pit)->itemptr())[0]->surface().empty())? ((*pit)->itemptr())[0]->surface():((*pit)->itemptr())[0]->word()) << "\t" << (*pit)->tag() << endl;; } } cout << endl; }