package lkformat; import se.lth.cs.nlp.depsrl.format.*; import se.lth.cs.nlp.nlputils.core.Pair; import se.lth.cs.nlp.nlputils.core.Triple; import se.lth.cs.nlp.nlputils.depgraph.*; import java.util.*; import java.io.*; public class SRLPostProcess { /* Luca drank coffee at 12 in his office. Event:drink.01(e1) Arg0(e1, Luca) Arg1(e1, coffee) ArgM-TMP(e1, AT(12)) Argm-LOC(e1, IN(office)) He put it on the table. Event:put.01(e2) Arg0(e2, He) Arg1(e2, it) Arg2(e2, ON(table)) eller Arg2(e2, table)??? He gave me the bottle. Event:give.01(e3) Arg0(e3, He) Arg1(e3, bottle) Arg2(e3, me) He gave the bottle to me. Event:give.01(e4) Arg0(e4, He) Arg1(e4, bottle) Arg2(e4, TO(me)) eller Arg2(e4, me)??? */ private static int idCounter = 0; private static int startTokenPos = 0; static void processPAs(ArrayList predOut, ArrayList argOut, ArrayList argLinksOut, List pas, DepGraph dg, LexicalDB propBank, LexicalDB nomBank) { //System.out.println("pas = " + pas); HashMap preds = new HashMap(); for(PAStructure pa: pas) if(!pa.lemma.endsWith("SU")) preds.put(pa.pred, pa); HashMap evIds = new HashMap(); HashMap argIds = new HashMap(); processPAsOrdered(predOut, argOut, argLinksOut, dg.nodes[0], preds, evIds, argIds, propBank, nomBank); startTokenPos += dg.nodes.length - 1; } private static void processPAsOrdered(ArrayList predOut, ArrayList argOut, ArrayList argLinksOut, DepNode n, HashMap preds, HashMap evIds, HashMap argIds, LexicalDB propBank, LexicalDB nomBank) { for(DepNode c: n.children) processPAsOrdered(predOut, argOut, argLinksOut, c, preds, evIds, argIds, propBank, nomBank); PAStructure pa = preds.get(n); if(pa == null) return; idCounter++; String id = "" + idCounter; evIds.put(n, id); String predToken = "" + (startTokenPos + pa.pred.position); /* Bug in some lemmas. */ String lstr = pa.lemma; lstr = lstr.replaceAll("\\.+", "."); predOut.add(new String[] { id, predToken, lstr}); Roleset rs = null; if(n.pos.startsWith("V") || n.pos.startsWith("J")) rs = propBank.getRoleset(pa.lemma); if(rs == null) rs = nomBank.getRoleset(pa.lemma); for(int i = 0; i < pa.argLabels.size(); i++) { String label = pa.argLabels.get(i); if(label.startsWith("SU")) continue; if(label.startsWith("C-") || label.startsWith("R-")) continue; ArrayList as = new ArrayList(); as.add(pa.args.get(i)); for(int j = i + 1; j < pa.argLabels.size(); j++) { if(pa.argLabels.get(j).matches("[CR]-" + label)) as.add(pa.args.get(j)); } //DepNode arg = pa.args.get(i); // for instance, IN(bar): carg = bar node, rel = IN Pair p = getContentAndRel(as, preds); DepNode carg = p.right; String rel = p.left; //String argString; String evIdArg = evIds.get(carg); /*if(evIdArg != null) argString = evIdArg; else argString = getArgString(carg);*/ String argLinkString; String argTokenId = null; if(evIdArg != null) { argLinkString = evIdArg; } else { /*String cachedArgId = argIds.get(carg); if(cachedArgId == null) { idCounter++; cachedArgId = "" + idCounter; argIds.put(carg, cachedArgId); String argTokenId = "" + (startTokenPos + carg.position); argOut.add(new String[] { cachedArgId, argTokenId }); } argLinkString = cachedArgId;*/ argTokenId = "" + (startTokenPos + carg.position); } /* if(!rel.equals("") && label.startsWith("AM")) { //String rel = arg.word.toUpperCase(); argString = rel + "(" + argString + ")"; }*/ char c = label.charAt(label.length() - 1); String vnLabel = null, roleDescr = null; if(Character.isDigit(c) && rs != null) { String rid = "" + c; Role role = null; for(Role r: rs.roles) if(r.id.equals(rid)) { role = r; break; } if(role != null) { /*if(true && role.vntheta != null) //semRel = "VN:" + role.vntheta; else semRel = getNiceName(role.descr);*/ vnLabel = role.vntheta; roleDescr = getNiceName(role.descr); } } String linkId = "" + (++idCounter); if(label.startsWith("AM-")) label = label.substring(3); else label = label.substring(1); rel = rel.replaceAll("_+$", ""); //argLinksOut.add(new String[] { linkId, id, argLinkString, label, vnLabel, roleDescr, rel }); argLinksOut.add(new String[] { linkId, id, evIdArg, argTokenId, label, vnLabel, roleDescr, rel }); //pw.printf("%s(%s, %s)\n", semRel, id, argString); } //pw.println(); } /* Standardizes a PropBank/NomBank role description. */ private static String getNiceName(String l) { int ix = l.indexOf(','); if(ix != -1) l = l.substring(0, ix); l = l.replaceAll("\\(.*", ""); l = l.replaceAll(" ", "_"); l = l.toLowerCase(); return l; } private static String getArgString(DepNode n) { if(!n.pos.startsWith("NNP")) { if(n.lemma != null) return n.lemma; else return n.word; } StringBuilder sb = new StringBuilder(); for(DepNode c: n.children) if(c.position < n.position && c.relations[0].matches("NAME|TITLE")) sb.append(c.word + "_"); sb.append(n.word); for(DepNode c: n.children) if(c.position > n.position && c.relations[0].matches("NAME|POSTHON")) sb.append("_" + c.word); return sb.toString(); } private static Pair getContentAndRel(ArrayList ns, HashMap preds) { DepNode main = null; //System.out.println("gcar: ns = " + ns); for(DepNode n: ns) if(n.pos.matches("IN|TO")) { main = n; break; } if(main == null) for(DepNode n: ns) if(n.pos.matches("VB|JJ.*")) { main = n; break; } if(main == null) for(DepNode n: ns) if(n.pos.matches("VB.*")) { main = n; break; } if(main == null) for(DepNode n: ns) if(preds.containsKey(n)) { main = n; break; } if(main == null) main = ns.get(0); //System.out.println("gcar: ns = " + ns); Pair p = getContentAndRel(main, preds); //System.out.println("gcar: ns = " + ns); //System.out.println("returns " + p); return p; } private static Pair getContentAndRel(DepNode n, HashMap preds) { //System.out.println("gcar: " + n); if(preds.containsKey(n)) return new Pair("", n); if(!n.pos.matches("(IN|TO|VB.*)") || n.children.length == 0) return new Pair("", n); String rel; if(n.pos.matches("IN|TO") || n.word.matches("[Aa]ccording")) rel = n.word.toUpperCase(); else rel = ""; DepNode c; c = findChild(n, "IN|TO|VB.*"); if(c != null) { Pair p = getContentAndRel(c, preds); p.left = rel + "_" + p.left; return p; } c = findChild(n, "NN.*|PRP"); if(c != null) return new Pair(rel, c); c = findChild(n, "JJ.*|RB.*"); if(c != null) return new Pair(rel, c); c = findChild(n, "\\$|\\#"); if(c != null) return new Pair(rel, c); return new Pair(rel, n.children[0]); } private static DepNode findChild(DepNode n, String regex) { for(DepNode c: n.children) if(c.pos.matches(regex)) return c; return null; } }