// Implementation of delphin::DerivReader: a recursive-descent reader for
// bracketed derivation trees that reports internal nodes and leaves to
// caller-supplied hook functions.
//
// NOTE(review): this file was recovered from a copy in which all text between
// a '<' and the following '>' had been stripped. The template headers, the
// element types of vector/map/pair and the two standard #include targets were
// restored by inference from how the names are used below -- confirm them
// against derivReader.h. One statement in readNode() could not be recovered;
// see the NOTE(review) there.
#include "derivReader.h"

#include <cctype>
#include <cstdlib>
#include <iostream>
#include <sstream>

using namespace std;
using namespace delphin;

namespace delphin {

// Stores the grammar, the user data object and the three optional hooks
// (any of them may be NULL), and compiles the regular expressions used to
// recognise the pieces of a derivation:
//   rootnode  - "(" label "("            label is captured
//   startnode - "("                      opening of any child node
//   endnode   - ")"                      closing of any node
//   intnode   - "(" id name score start end "("   five captures
//   leafnode  - "(" quoted-string [number]        string is captured
//   tokenre   - "\"token [ ... ]\""      the bracketed FS is captured
template <class Grammar, class Data>
DerivReader<Grammar, Data>::DerivReader(Grammar const &g, Data &d,
    void (*start_int_node)(Grammar &, string, vector<Node> &, int, double,
                           int, int, Data &),
    void (*end_int_node)(Grammar &, vector<Node> &, vector<Node> &, Data &),
    void (*leaf)(Grammar &, Node, vector<Node> &, Data &))
  : _g(g), _d(d), _start_int_node_funct(start_int_node),
    _end_int_node_funct(end_int_node), _leaf_funct(leaf)
{
  rootnode.assign("^\\s*\\(([^\\s\"]+)\\s*\\(");
  endnode.assign("^\\s*\\)\\s*");
  startnode.assign("^\\s*\\(\\s*");
  intnode.assign(
    "^\\s*\\((\\d+) (\\S+) (\\S+) (\\d+) (\\d+)\\s*\\(");
  leafnode.assign(
    "^\\s*\\(\"(([^\"]*(\\\\\")*)*[^\"]*)\"(\\s*\\d+\\s*)?");
  tokenre.assign("^\\s*\"token (\\[.*?\\])\"");
}

template <class Grammar, class Data>
DerivReader<Grammar, Data>::~DerivReader()
{
}

// Reads one complete derivation from `treestring`, invoking the hooks as
// nodes are encountered. Any text left over after the tree is discarded.
template <class Grammar, class Data>
void DerivReader<Grammar, Data>::readDeriv(string treestring)
{
  vector<Node> ancestors;
  readNode(treestring, ancestors);
}

// Reads a single node (and, recursively, its subtree) from the front of
// `rest` and returns the unconsumed remainder of the input.
//
// `ancestors` is the stack of nodes on the path from the root. On return
// from an internal-node or leaf read, the node just read has been pushed on
// top of `ancestors`; the caller is responsible for popping it.
//
// Malformed input is reported on stderr and terminates the process with
// exit(1).
template <class Grammar, class Data>
string DerivReader<Grammar, Data>::readNode(const string &rest,
                                            vector<Node> &ancestors)
{
  typedef map<string, string> FeatureMap;

  boost::smatch res;
  string next = rest;

  if (boost::regex_search(next, res, rootnode)) {
    // ---- root node: "(" label "(" ...
    ancestors.push_back(Node(res[1].str()));
    // consume only up to the end of the label, so that the child's opening
    // parenthesis is still there for the recursive call
    next = next.substr(
      static_cast<string::size_type>(res[1].second - res[0].first));
    next = readNode(next, ancestors);
    if (!boost::regex_match(next, res, endnode)) {
      cerr << "Unbalanced tree at: " << next << endl;
      exit(1);
    }
    vector<Node> children;
    children.push_back(ancestors.back());
    ancestors.pop_back();
    // finished node's children: function
    if (_end_int_node_funct != NULL) {
      _end_int_node_funct(_g, children, ancestors, _d);
    }
  } else if (boost::regex_search(next, res, intnode)) {
    // ---- internal node: "(" id name score start end "(" ...
    string s1 = res[2].str();
    // record lexical type, not lex entry
    const string let = _g.letype(s1);
    if (!let.empty()) {
      s1 = let;
      //modified to grab lemma, lex entry _and_ lex type
      //string lex = _g.lexeme(s1);
      //s1 = string(lex+"\t"+s1+"\t"+let);
    }

    // NOTE(review): the recovered text is damaged at this point. A loop
    // beginning `for (unsigned int i=0;i...` followed the lexical-type
    // lookup; its condition and body are lost and have NOT been re-invented
    // here -- restore them from version control. The four declarations and
    // the edge-id extraction below were lost in the same span and are
    // reconstructed from the capture groups of `intnode` and from the way
    // the variables are used afterwards.
    int edge = 0;
    int start = 0;
    int end = 0;
    double score = 0.0;
    istringstream(res[1].str()) >> edge;
    istringstream(res[3].str()) >> score;
    istringstream(res[4].str()) >> start;
    istringstream(res[5].str()) >> end;

    // funct call to deal with internal nodes; runs before the node itself
    // is pushed, so `ancestors` holds only its proper ancestors
    if (_start_int_node_funct != NULL)
      _start_int_node_funct(_g, s1, ancestors, edge, score, start, end, _d);

    ancestors.push_back(Node(s1));
    ancestors.back().start = start;
    ancestors.back().end = end;
    // consume up to the end of the last number; the first child's opening
    // parenthesis stays in the input
    next = next.substr(
      static_cast<string::size_type>(res[5].second - res[0].first));

    vector<Node> children;
    while (boost::regex_search(next, res, startnode)) {
      next = readNode(next, ancestors);
      // each child leaves itself on top of the stack; move it to `children`
      children.push_back(ancestors.back());
      ancestors.pop_back();
    }

    if (!boost::regex_search(next, res, endnode)) {
      cerr << "Unbalanced node(I) at: " << next << endl;
      exit(1);
    }
    next = next.substr(
      static_cast<string::size_type>(res[0].second - res[0].first));
    // finished node's children: function
    if (_end_int_node_funct != NULL) {
      _end_int_node_funct(_g, children, ancestors, _d);
    }
  } else if (boost::regex_search(next, res, leafnode)) {
    // ---- leaf: "(" quoted-surface-string, then a token id or token FSs
    Node newtok("\"" + res[1].str() + "\"");
    next = next.substr(
      static_cast<string::size_type>(res[0].second - res[0].first));

    // An empty remainder falls through to the "Unbalanced node(L)" report
    // below instead of throwing from at(0).
    if (!next.empty() && isdigit(static_cast<unsigned char>(next[0]))) {
      // plain numeric token id, terminated by the closing parenthesis.
      // size_type (not unsigned int) so the npos comparison is meaningful
      // where size_t is wider than unsigned int.
      const string::size_type paren = next.find_first_of(')');
      if (paren == string::npos) {
        cerr << "Ill-formed token at: " << next << endl;
        exit(1);
      }
      int tid = 0;
      istringstream idstr(next.substr(0, paren));
      idstr >> tid;
      newtok.tok_ids.push_back(tid);
      next.erase(0, paren);
    } else {
      // zero or more token feature structures
      while (next.substr(0, 8) == "\"token [") {
        FeatureMap tokfeatures;
        next = readTokenFS(next, tokfeatures);

        // token ids: walk the list +ID.LIST.FIRST, +ID.LIST.REST.FIRST, ...
        // for as long as the entry exists and starts with a digit
        string listfeat("+ID.LIST.FIRST");
        for (FeatureMap::const_iterator id = tokfeatures.find(listfeat);
             id != tokfeatures.end() && !id->second.empty()
               && isdigit(static_cast<unsigned char>(id->second[0]));
             id = tokfeatures.find(listfeat)) {
          int tid = 0;
          istringstream idstr(id->second);
          idstr >> tid;
          newtok.tok_ids.push_back(tid);
          // replace the trailing "FIRST" by "REST.FIRST"
          listfeat.erase(listfeat.length() - 5);
          listfeat += "REST.FIRST";
        }

        // CARGs of successive tokens are joined with '_'
        FeatureMap::const_iterator feat = tokfeatures.find("+CARG");
        if (feat != tokfeatures.end()) {
          if (!newtok.carg.empty())
            newtok.carg += "_";
          newtok.carg += feat->second;
        }

        // only the first token's case class is kept
        feat = tokfeatures.find("+CLASS.+CASE");
        if (feat != tokfeatures.end() && newtok.caseclass.empty()) {
          newtok.caseclass = feat->second;
        }

        // start comes from the first token that supplies one (while
        // newtok.start is still negative) ...
        feat = tokfeatures.find("+FROM");
        if (feat != tokfeatures.end() && newtok.start < 0) {
          int from = 0;
          istringstream numstr(feat->second);
          numstr >> from;
          newtok.start = from;
        }

        // ... end from the last token that supplies one
        feat = tokfeatures.find("+TO");
        if (feat != tokfeatures.end()) {
          int to = 0;
          istringstream numstr(feat->second);
          numstr >> to;
          newtok.end = to;
        }

        // supertags and their probabilities are parallel lists; a missing
        // probability is recorded as "0"
        listfeat = "+STAG.+STAGS.FIRST";
        string listfeat2("+STAG.+SPRBS.FIRST");
        for (FeatureMap::const_iterator tag = tokfeatures.find(listfeat);
             tag != tokfeatures.end();
             tag = tokfeatures.find(listfeat)) {
          const FeatureMap::const_iterator prob = tokfeatures.find(listfeat2);
          newtok.supertags.push_back(tag->second);
          newtok.superprobs.push_back(
            prob != tokfeatures.end() ? prob->second : string("0"));
          listfeat.erase(listfeat.length() - 5);
          listfeat += "REST.FIRST";
          listfeat2.erase(listfeat2.length() - 5);
          listfeat2 += "REST.FIRST";
        }
      }
    }

    if (!boost::regex_search(next, res, endnode)) {
      cerr << "Unbalanced node(L) at: " << next << endl;
      exit(1);
    }
    next = next.substr(
      static_cast<string::size_type>(res[0].second - res[0].first));

    // function to deal with finding a leaf; runs before the leaf is pushed
    if (_leaf_funct != NULL)
      _leaf_funct(_g, newtok, ancestors, _d);
    ancestors.push_back(newtok);
  } else {
    cerr << "Ill-formed derivation tree at: " << next << endl;
    exit(1);
  }

  return next;
}

// Reads one quoted token feature structure from the front of `rest` and
// stores its flattened feature paths and values in `tokfeatures`. `rest` is
// advanced past the token, past any digits that directly follow it and past
// the surrounding whitespace; the advanced string is also returned.
// Exits with status 1 if `rest` does not start with a token FS.
template <class Grammar, class Data>
string DerivReader<Grammar, Data>::readTokenFS(string &rest,
                                               map<string, string> &tokfeatures)
{
  boost::smatch res;
  if (!boost::regex_search(rest, res, tokenre)) {
    cerr << "no token FS found at: " << rest << endl;
    exit(1);
  }

  // copy the bracketed part out before `rest` (which `res` points into) is
  // modified
  string tstring(res[1].first, res[1].second);
  rest = rest.substr(
    static_cast<string::size_type>(res[0].second - res[0].first));

  map<int, string> reentrancies;
  readFS(tstring, string(), tokfeatures, reentrancies);

  removeWhitespace(rest);
  while (!rest.empty() && isdigit(static_cast<unsigned char>(rest[0]))) {
    //another token
    rest.erase(0, 1);
  }
  removeWhitespace(rest);
  return rest;
}

// Parses the bracketed feature structure at the front of `tok`, consuming it.
// Every feature is entered into `tokfeatures` under its dotted path, prefixed
// by `featstr` when that is non-empty (existing entries are not overwritten).
// `reentrancies` maps each "#n" index to the path at which it was first given
// a value, so that later bare "#n" references can copy that value.
// Exits with status 1 if `tok` does not start with '[' or the closing ']' is
// missing.
template <class Grammar, class Data>
void DerivReader<Grammar, Data>::readFS(string &tok, string featstr,
                                        map<string, string> &tokfeatures,
                                        map<int, string> &reentrancies)
{
  if (tok.empty() || tok[0] != '[') {
    cerr << "not a FS at: " << tok << endl;
    exit(1);
  }
  tok.erase(0, 1);

  // Skip whitespace before testing for ']' so that an empty structure "[ ]"
  // is not mistaken for a feature named "]".
  removeWhitespace(tok);
  while (!tok.empty() && tok[0] != ']') {
    const string feat = readFeat(tok);
    string fullfeat = featstr;
    if (!fullfeat.empty())
      fullfeat += ".";
    fullfeat += feat;

    removeWhitespace(tok);
    const pair<int, string> value = readVal(tok);

    string val;
    if (value.first >= 0 && value.second.empty()) {
      //look up index
      val = tokfeatures[reentrancies[value.first]];
    } else {
      if (value.first >= 0) {
        //first defn of index
        reentrancies.insert(pair<int, string>(value.first, fullfeat));
      }
      val = value.second;
    }
    tokfeatures.insert(pair<string, string>(fullfeat, val));

    // a value may be followed by a nested structure for the same feature
    if (!tok.empty() && tok[0] == '[') {
      readFS(tok, fullfeat, tokfeatures, reentrancies);
    }
    removeWhitespace(tok);
  }

  if (tok.empty()) {
    cerr << "unterminated FS" << endl;
    exit(1);
  }
  tok.erase(0, 1);  // the closing ']'
}

// Consumes a feature name (a run of non-whitespace characters) and the
// whitespace after it from the front of `tok`; returns the name.
template <class Grammar, class Data>
string DerivReader<Grammar, Data>::readFeat(string &tok)
{
  const string feat = parseSymbol(tok);
  removeWhitespace(tok);
  return feat;
}

// Consumes a feature value and the whitespace after it from the front of
// `tok`. Returns (index, value): `index` is the number following a leading
// '#', or -1 when there is none; `value` is the quoted string or bare symbol,
// and is empty for a bare "#n" reference.
template <class Grammar, class Data>
pair<int, string> DerivReader<Grammar, Data>::readVal(string &tok)
{
  // a string value opens with backslash, backslash, double quote
  const string openquote("\\\\\"");
  int index = -1;
  string val;

  if (tok.compare(0, 3, openquote) == 0) {
    //parse string
    val = parseString(tok);
  } else if (!tok.empty() && tok[0] == '#') {
    //index
    tok.erase(0, 1);
    string::size_type ndigits = 0;
    while (ndigits < tok.length()
           && isdigit(static_cast<unsigned char>(tok[ndigits])))
      ++ndigits;
    if (ndigits > 0) {
      // only parse when there are digits, so that a malformed "#" leaves
      // index at -1 whatever the stream does on a failed extraction
      istringstream(tok.substr(0, ndigits)) >> index;
      tok.erase(0, ndigits);
    }
    if (!tok.empty() && tok[0] == '=') {
      //index and val
      tok.erase(0, 1);
      if (tok.compare(0, 3, openquote) == 0) {
        val = parseString(tok);
      } else {
        val = parseSymbol(tok);
      }
    }
  } else {
    //symbol (empty when tok is empty)
    val = parseSymbol(tok);
  }

  removeWhitespace(tok);
  return pair<int, string>(index, val);
}

// Removes the leading run of non-whitespace characters from `tok` and
// returns it.
template <class Grammar, class Data>
string DerivReader<Grammar, Data>::parseSymbol(string &tok)
{
  string::size_type len = 0;
  while (len < tok.length() && !isspace(static_cast<unsigned char>(tok[len])))
    ++len;
  const string symbol = tok.substr(0, len);
  tok.erase(0, len);
  return symbol;
}

// Consumes a quoted string from the front of `tok` and returns its contents.
// The caller has already checked that `tok` starts with the three-character
// opening delimiter (backslash, backslash, double quote); the string ends at
// the next occurrence of that delimiter that is not preceded by a further
// backslash. Exits with status 1 on a badly quoted or unterminated string.
template <class Grammar, class Data>
string DerivReader<Grammar, Data>::parseString(string &tok)
{
  string val;
  tok.erase(0, 3);  // opening delimiter

  while (!tok.empty()) {
    // Look three characters ahead for a double quote. Three backslashes in
    // front of it make it an escaped quote inside the string; otherwise the
    // current character must be the last one before the closing delimiter.
    if (tok.length() >= 4 && tok[3] == '"'
        && tok.compare(0, 3, "\\\\\\") != 0) {
      if (tok.compare(1, 2, "\\\\") != 0) {
        cerr << "quoting wrong at: " << tok << endl;
        exit(1);
      }
      val += tok[0];
      tok.erase(0, 1);
      break;
    }
    val += tok[0];
    tok.erase(0, 1);
  }

  if (tok.length() >= 3 && tok.compare(0, 3, "\\\\\"") == 0) {
    tok.erase(0, 3);  // closing delimiter
  } else {
    cerr << "Unterminated quoted string at: " << tok << endl;
    exit(1);
  }
  return val;
}

// Strips leading whitespace from `rest` in place.
template <class Grammar, class Data>
void DerivReader<Grammar, Data>::removeWhitespace(std::string &rest)
{
  string::size_type len = 0;
  while (len < rest.length() && isspace(static_cast<unsigned char>(rest[len])))
    ++len;
  rest.erase(0, len);
}

} //namespace