package mpqareader; import java.util.*; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.io.*; import se.lth.cs.nlp.depsrl.format.CoNLL2008Format; import se.lth.cs.nlp.depsrl.format.PAStructure; import se.lth.cs.nlp.nlputils.annotations.*; import se.lth.cs.nlp.nlputils.core.ArrayComparator; import se.lth.cs.nlp.nlputils.core.BinaryOperator; import se.lth.cs.nlp.nlputils.core.CollectionUtils; import se.lth.cs.nlp.nlputils.core.Triple; import se.lth.cs.nlp.nlputils.depgraph.DepGraph; import se.lth.cs.nlp.nlputils.depgraph.DepNode; /** Command-line converter that reads MPQA-annotated texts (via MPQAReader.processDirectory), dependency and predicate-argument graphs (via CoNLL2008Format.readNextGraph) and one line of SST output per sentence, and writes one set of XML layer files per document: text, tokens, POS, subjectivity, subjective sentences, dependency syntax, predicate-arguments and SST. NOTE(review): throughout this copy many string literals are empty or cut off mid-statement and generic type arguments are missing, which suggests that markup was stripped from the file; confirm against a pristine copy before changing any output code. */ public class MPQAToLK { /** Character encoding used for every output file. */ private static final String ENCODING = "UTF-8"; /** Counts the documents handled so far; printed as a progress prefix by processText. */ private static int dirCounter = 0; /** Writes all output layer files for one document. The document name is the mpqa_file property with the database.mpqa.2.0/docs/ prefix removed and slashes replaced by underscores. Eight writers are opened under dir; findSentenceSpans fills the token, subjective-sentence, POS, dependency, predicate-argument and SST outputs, and printMPQAAnnotation fills the subjectivity output. @param dir output directory @param text the annotated document @param srlInput source of dependency and predicate-argument graphs, one graph is consumed per processed sentence @param sstInput source of SST lines, one line is consumed per processed sentence @throws IOException if an output file cannot be opened or the SST input cannot be read. NOTE(review): the writers are not closed when an exception is thrown midway. */ private static void processText(String dir, AnnotatedText text, Scanner srlInput, BufferedReader sstInput) throws IOException { dirCounter++; String docnameFull = (String) text.getProperty("mpqa_file"); String docname = docnameFull.replaceAll("database.mpqa.2.0/docs/", ""); System.out.println(dirCounter + ": " + docname); docname = docname.replaceAll("/", "_"); String textFileName = dir + "/" + docname + ".lktext.xml"; String tokenFileName = dir + "/" + docname + ".tokens.xml"; String posFileName = dir + "/" + docname + ".pos.xml"; String subjFileName = dir + "/" + docname + ".mpqasubjectivity.xml"; String subjSenFileName = dir + "/" + docname + ".subjsen.xml"; String depFileName = dir + "/" + docname + ".depsyntax.xml"; String srlFileName = dir + "/" + docname + ".predargs.xml"; String sstFileName = dir + "/" + docname + ".sst.xml"; PrintWriter textOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(textFileName), ENCODING)); textOut.println(""); textOut.println(""); textOut.println(""); textOut.println(" " + docnameFull + ""); /* Copy the remaining document properties; mpqa_file was already written above and source is renamed to mpqa_source. Only String and ArrayList values are written. */ for(String k: text.properties.keySet()) { Object v = text.getProperty(k); if(k.equals("mpqa_file")) continue; if(k.equals("source")) k = "mpqa_source"; if(v instanceof String) { textOut.println(" " + v + ""); } else { if(v instanceof ArrayList) { String vs = 
v.toString(); vs = vs.substring(1, vs.length() - 1); textOut.println(" " + vs + ""); } } } textOut.println(""); textOut.print(""); /* This one document contains the character with code 26, which is removed before the text is written. */ if(docname.equals("20020201_20.45.53-22539")) { String s = "" + (char) 26; text.text = text.text.replaceAll(s, ""); } textOut.println(escapeXML(text.text)); textOut.println(""); textOut.println(""); textOut.close(); PrintWriter tokenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(tokenFileName), ENCODING)); tokenOut.println(""); tokenOut.println(""); tokenOut.println(""); tokenOut.println(" " + docname + ".lktext.xml"); tokenOut.println(" MPQAToLK"); tokenOut.println(""); PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(posFileName), ENCODING)); posOut.println(""); posOut.println(""); posOut.println(""); posOut.println(" " + docname + ".lktext.xml"); posOut.println(" LTHPOSTagger"); posOut.println(""); PrintWriter subjSenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(subjSenFileName), ENCODING)); subjSenOut.println(""); subjSenOut.println(""); subjSenOut.println(""); subjSenOut.println(" " + docname + ".lktext.xml"); subjSenOut.println(" MPQAToLK"); subjSenOut.println(""); PrintWriter subjOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(subjFileName), ENCODING)); subjOut.println(""); subjOut.println(""); subjOut.println(""); subjOut.println(" " + docname + ".lktext.xml"); subjOut.println(" MPQAToLK"); subjOut.println(""); PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depFileName), ENCODING)); depOut.println(""); depOut.println(""); depOut.println(""); depOut.println(" " + docname + ".lktext.xml"); depOut.println(" LTH-DEP-SRL"); depOut.println(""); PrintWriter srlOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(srlFileName), ENCODING)); srlOut.println(""); srlOut.println(""); srlOut.println(""); srlOut.println(" " + docname + ".lktext.xml"); srlOut.println(" LTH-DEP-SRL"); srlOut.println(""); 
PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(sstFileName), ENCODING)); sstOut.println(""); sstOut.println(""); sstOut.println(""); sstOut.println(" " + docname + ".lktext.xml"); sstOut.println(" SSTLight"); sstOut.println(""); /* The returned count is stored in idc but not used afterwards in this method. */ int idc = findSentenceSpans(text, docname, tokenOut, subjSenOut, srlInput, sstInput, posOut, depOut, srlOut, sstOut); tokenOut.println(""); tokenOut.close(); posOut.println(""); posOut.close(); depOut.println(""); depOut.close(); srlOut.println(""); srlOut.close(); sstOut.println(""); sstOut.close(); subjSenOut.println(""); subjSenOut.close(); printMPQAAnnotation(text, docname, subjOut, 0); /* NOTE(review): subjSenOut was already closed a few statements earlier; this second close is redundant. */ subjSenOut.close(); subjOut.close(); } /** Comparator over the intensity labels listed here. Presumably it orders a label by its position in the array, so that extreme sorts first; ArrayComparator is defined elsewhere, TODO confirm. */ private static final Comparator INTENSITY_COMP = new ArrayComparator(new String[] { "extreme", "high", "medium", "low", "none" }); /** Merges two polarity labels into one. A null operand yields the other operand; equal operands yield that value; if exactly one operand starts with unc, the other (certain) operand wins. Otherwise, when both are uncertain the uncertain- prefix is stripped first and re-added to the result; both, or a positive and negative pair, yields both; neutral yields the other operand; any remaining combination throws RuntimeException. Not referenced elsewhere in the visible code. */ private static final BinaryOperator POLARITY_OP = new BinaryOperator() { public String apply(String t1, String t2) { if(t1 == null) return t2; if(t2 == null) return t1; if(t1.equals(t2)) return t1; if(t1.startsWith("unc") && !t2.startsWith("unc")) return t2; if(t2.startsWith("unc") && !t1.startsWith("unc")) return t1; boolean unc = t1.startsWith("unc"); if(unc) { t1 = t1.substring("uncertain-".length()); t2 = t2.substring("uncertain-".length()); } if(t1.equals("both") || t2.equals("both")) return unc? "uncertain-both": "both"; if(t1.equals("positive") && t2.equals("negative")) return unc? "uncertain-both": "both"; if(t2.equals("positive") && t1.equals("negative")) return unc? "uncertain-both": "both"; if(t1.equals("neutral")) return unc? ("uncertain-" + t2): t2; if(t2.equals("neutral")) return unc? ("uncertain-" + t1): t1; throw new RuntimeException("unhandled: t1 = " + t1 + ", t2 = " + t2); } }; /** Classifies a sentence by the subjective annotations that lie completely inside it. Only spans labelled GATE_direct-subjective or GATE_expressive-subjectivity without an insubstantial property are considered. A missing, empty or neutral intensity is treated as low, and a missing or empty polarity as neutral; an uncertain- prefix on the polarity is ignored. Intensities are combined per polarity with CollectionUtils.min under INTENSITY_COMP (presumably keeping the stronger label; TODO confirm). @param sen the sentence span @param text the document whose layers from index 1 upward are scanned @return a three-element array of intensity labels in the order positive, neutral, negative, or null when all three end up as none */ static String[] classifySubjSen(Span sen, AnnotatedText text) { /* * FROM THE MPQA DOCUMENTATION * A sentence was considered subjective if 1 OR 2: 1. 
the sentence contains a "GATE_direct-subjective" annotation WITH attribute intensity NOT IN ['low', 'neutral'] AND NOT WITH attribute insubstantial. 2. the sentence contains a "GATE_expressive-subjectivity" annotation WITH attribute intensity NOT IN ['low'] Otherwise, a sentence was considered objective. */ String posInt = "none"; String negInt = "none"; String neuInt = "none"; /* Layer 0 is iterated as the sentence layer in findSentenceSpans, so the annotation scan starts at layer 1. */ for(int i = 1; i < text.layers.size(); i++) for(Span s: (AnnotationLayer) text.layers.get(i)) { if(s.start < sen.start || s.end > sen.end) continue; if(s.label.matches("GATE_direct-subjective|GATE_expressive-subjectivity")) { if(s.getProperty("insubstantial") != null) continue; String intens = (String) s.getProperty("intensity"); if(intens != null) intens = intens.trim(); else intens = "low"; if(intens.equals("")) intens = "low"; if(intens.equals("neutral")) intens = "low"; String pol = (String) s.getProperty("polarity"); if(pol == null) pol = "neutral"; else pol = pol.trim(); if(pol.equals("")) pol = "neutral"; boolean uncertain = pol.startsWith("uncertain-"); if(uncertain) pol = pol.substring("uncertain-".length()); if(pol.equals("neutral")) neuInt = CollectionUtils.min(intens, neuInt, INTENSITY_COMP); if(pol.matches("negative|both")) negInt = CollectionUtils.min(intens, negInt, INTENSITY_COMP); if(pol.matches("positive|both")) posInt = CollectionUtils.min(intens, posInt, INTENSITY_COMP); } } /* A neutral intensity of only low is downgraded to none, so it does not make the sentence subjective on its own. */ if(neuInt.equals("low")) neuInt = "none"; if(posInt.equals("none") && negInt.equals("none") && neuInt.equals("none")) return null; else return new String[] { posInt, neuInt, negInt }; } /** Global counters printed by main: sen0 counts every sentence span visited by findSentenceSpans, sen1 counts those that get past its skip checks. */ private static int sen0, sen1; /** Returns true when the token contains an angle bracket. */ private static boolean isXMLTag(String s) { return s.contains("<") || s.contains(">"); } /** Matches any string that contains at least one ASCII letter. */ private static final Pattern ONE_LETTER = Pattern.compile(".*[A-Za-z].*"); /** Writes the token entries and then, for each sentence span in layer 0, the sentence, subjective-sentence, dependency, POS and lemma, predicate-argument and SST entries. For every sentence that is not skipped, exactly one graph is read from srlInput and one line from sstInput, so both inputs must be aligned with the sentences processed here. POS, lemma, predicate, argument and SST entries are buffered in StringBuilders and written after the sentence loop. @return the number of sentences written to the subjective-sentence output @throws IOException if reading sstInput fails */ private static int findSentenceSpans(AnnotatedText text, String baseName, PrintWriter tokenOut, PrintWriter subjSenOut, Scanner srlInput, BufferedReader sstInput, PrintWriter posOut, PrintWriter depOut, PrintWriter srlOut, 
PrintWriter sstOut) throws IOException { int[] tokenStarts = (int[]) text.getProperty("token-starts"); int[] tokenEnds = (int[]) text.getProperty("token-ends"); tokenOut.println(""); /* Token ids are 1-based; the end offset passed on is tokenEnds minus one. */ for(int i = 0; i < text.tokens.length; i++) { printEntity(text.tokens[i], tokenStarts[i], tokenEnds[i] - 1, i + 1, tokenOut); } tokenOut.println(""); int idCounter = text.tokens.length; int subjIdCounter = 0; tokenOut.println(""); subjSenOut.println(""); depOut.println(""); //srlOut.println(""); StringBuilder sbP = new StringBuilder(); StringBuilder sbA = new StringBuilder(); StringBuilder sb1 = new StringBuilder(); StringBuilder sb2 = new StringBuilder(); StringBuilder sb3 = new StringBuilder(); StringBuilder sbPOS = new StringBuilder(); StringBuilder sbLemma = new StringBuilder(); int paIdCounter = 0; int depIdCounter = 0; int posIdCounter = 0; int neIdCounter = 0; //ArrayList>> conllGraphs = new ArrayList(); Span prevSen = null; if(false && text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/Article247_66")) { System.out.println(text.layers.get(0).spans); System.exit(0); } for(Span sen: (AnnotationLayer) text.layers.get(0)) { sen0++; int start = sen.tokenStart; int end = sen.tokenEnd; /* Sentences that begin with the tokens AR and : are skipped entirely; a leading EN : pair is dropped, and leading tokens that look like XML tags are dropped as well. */ if(end - start > 2 && text.tokens[start].equals("AR") && text.tokens[start+1].equals(":")) { System.out.println("skipped arabic sentence"); continue; } if(end - start > 2 && text.tokens[start].equals("EN") && text.tokens[start+1].equals(":")) { start += 2; } while(start < end && start < text.tokens.length && isXMLTag(text.tokens[start])) start++; if(start >= end) continue; boolean sawWord = false; for(int i = start; i < end; i++) if(ONE_LETTER.matcher(text.tokens[i]).matches()) { sawWord = true; break; } /* NOTE(review): the message below says the sentence was skipped, but there is no continue, so processing goes on and a graph and an SST line are still consumed for it. Confirm which behaviour is intended. */ if(!sawWord) { System.err.println("Skipped this sentence:"); System.err.println("|" + text.text.substring(sen.start, sen.end) + "|"); } if(prevSen != null) { if(sen.start < prevSen.end) throw new RuntimeException("sentences not ordered"); if(sen.tokenStart < prevSen.tokenEnd) throw new 
RuntimeException("sentences not ordered"); /*if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/Article247_66")) { if(sen.end == 894) { System.out.println(sen); System.out.prin } }*/ } prevSen = sen; sen1++; idCounter++; tokenOut.print(" "); String[] ssp = classifySubjSen(sen, text); if(ssp != null) { String posi = ssp[0], neui = ssp[1], negi = ssp[2]; if(!posi.equals("none") || !negi.equals("none") || !neui.matches("none|low")) { subjIdCounter++; subjSenOut.print(" "); //subjSenOut.print(""); //subjSenOut.print(""); subjSenOut.print(""); subjSenOut.println(""); } } Triple> tr = CoNLL2008Format.readNextGraph(srlInput); //System.out.println("Read graph: " + tr.first); DepGraph dg = tr.first; fixLemmas(dg); HashMap tokenIndices = new HashMap(); int tokenIndex = start; /* Align each graph node (node 0 is left out) with the next document token that has the same word form. NOTE(review): the inner while loop has no upper bound, so a word that never matches runs tokenIndex past the end of text.tokens and throws ArrayIndexOutOfBoundsException. */ for(int i = 1; i < dg.nodes.length; i++) { DepNode n = dg.nodes[i]; while(!n.word.equals(text.tokens[tokenIndex])) tokenIndex++; tokenIndices.put(n, tokenIndex); tokenIndex++; } for(int i = 1; i < dg.nodes.length; i++) { DepNode n = dg.nodes[i]; int childTokenId = 1 + tokenIndices.get(n); sbPOS.append(" " + n.pos + "\n"); if(n.lemma != null && !n.lemma.equals("_")) sbLemma.append(" " + escapeXML(n.lemma) + "\n"); if(n.parents.length != 1) throw new IllegalArgumentException("Only single-head dependency trees allowed yet"); DepNode p = dg.nodes[i].parents[0]; if(p.position == 0) { depOut.println(" " + n.relations[0] + ""); } else { int parentTokenId = 1 + tokenIndices.get(p); depOut.println(" " + n.relations[0] + ""); } tokenIndex++; } paIdCounter = printPreds(sbP, tr.third, tokenIndices, paIdCounter); paIdCounter = printArgs(sbA, tr.third, tokenIndices, paIdCounter); if(true) { /* NOTE(review): readLine returns null at end of input, in which case the split on the next statement throws NullPointerException. */ String line = sstInput.readLine(); String[] ts = line.split(" "); if(ts.length % 6 != 0) throw new RuntimeException("illegal number of tokens"); ArrayList l = compress(ts, tr.first); ArrayList col1 = iobToSpans(l, 0, tokenIndices, tr.first); ArrayList col2 = iobToSpans(l, 1, tokenIndices, tr.first); ArrayList col3 
= iobToSpans(l, 2, tokenIndices, tr.first); for(Span s: col1) sb1.append("" + s.label + "\n"); for(Span s: col2) sb2.append("" + s.label + "\n"); for(Span s: col3) sb3.append("" + s.label + "\n"); } } tokenOut.println(""); subjSenOut.println(""); depOut.println(""); posOut.println(""); posOut.print(sbPOS); posOut.println(""); posOut.println(""); posOut.print(sbLemma); posOut.println(""); srlOut.println(""); srlOut.print(sbP); srlOut.println(""); srlOut.println(""); srlOut.print(sbA); srlOut.println(""); sstOut.println(""); sstOut.print(sb1); sstOut.println(""); sstOut.println(""); sstOut.print(sb2); sstOut.println(""); sstOut.println(""); sstOut.print(sb3); sstOut.println(""); return subjIdCounter; } /** Normalises lemmas in place for every node except index 0. An existing lemma (not null and not an underscore) is lower-cased. Otherwise, if the POS tag is one of the listed tags the lower-cased word form becomes the lemma, and failing that a fixed list of word forms gets a hand-written lemma. Nodes that match neither rule keep their missing lemma. NOTE(review): the entry for situated maps to situated, unlike its neighbours which map to a base form; confirm whether situate was intended. */ private static void fixLemmas(DepGraph dg) { for(int i = 1; i < dg.nodes.length; i++) { if(dg.nodes[i].lemma != null && !dg.nodes[i].lemma.equals("_")) dg.nodes[i].lemma = dg.nodes[i].lemma.toLowerCase(); else { if(dg.nodes[i].pos.matches("UH|EX|WDT|WRB|WP|WP\\$|DT|IN|TO|MD|PRP|VB|NN|JJ|CC|PDT|FW|NNP|NNPS|RB|RP|CD|\\.|\\,|\\#|\\$|:|\\(|\\)")) dg.nodes[i].lemma = dg.nodes[i].word.toLowerCase(); // holes in the original lemma lexicon else if(dg.nodes[i].word.toLowerCase().equals("biased")) dg.nodes[i].lemma = "bias"; else if(dg.nodes[i].word.toLowerCase().equals("aced")) dg.nodes[i].lemma = "ace"; else if(dg.nodes[i].word.toLowerCase().equals("barreled")) dg.nodes[i].lemma = "barrel"; else if(dg.nodes[i].word.toLowerCase().equals("bogged")) dg.nodes[i].lemma = "bog"; else if(dg.nodes[i].word.toLowerCase().equals("bruised")) dg.nodes[i].lemma = "bruise"; else if(dg.nodes[i].word.toLowerCase().equals("criss-crossed")) dg.nodes[i].lemma = "criss-cross"; else if(dg.nodes[i].word.toLowerCase().equals("delegated")) dg.nodes[i].lemma = "delegate"; else if(dg.nodes[i].word.toLowerCase().equals("delisted")) dg.nodes[i].lemma = "delist"; else if(dg.nodes[i].word.toLowerCase().equals("disguised")) dg.nodes[i].lemma = "disguise"; else 
if(dg.nodes[i].word.toLowerCase().equals("downsized")) dg.nodes[i].lemma = "downsize"; else if(dg.nodes[i].word.toLowerCase().equals("evacuated")) dg.nodes[i].lemma = "evacuate"; else if(dg.nodes[i].word.toLowerCase().equals("evaluated")) dg.nodes[i].lemma = "evaluate"; else if(dg.nodes[i].word.toLowerCase().equals("faxed")) dg.nodes[i].lemma = "fax"; else if(dg.nodes[i].word.toLowerCase().equals("emailed")) dg.nodes[i].lemma = "email"; else if(dg.nodes[i].word.toLowerCase().equals("graduated")) dg.nodes[i].lemma = "graduate"; else if(dg.nodes[i].word.toLowerCase().equals("guided")) dg.nodes[i].lemma = "guide"; else if(dg.nodes[i].word.toLowerCase().equals("headquartered")) dg.nodes[i].lemma = "headquarter"; else if(dg.nodes[i].word.toLowerCase().equals("inched")) dg.nodes[i].lemma = "inch"; else if(dg.nodes[i].word.toLowerCase().equals("influenced")) dg.nodes[i].lemma = "influence"; else if(dg.nodes[i].word.toLowerCase().equals("occured")) dg.nodes[i].lemma = "occur"; else if(dg.nodes[i].word.toLowerCase().equals("outbade")) dg.nodes[i].lemma = "outbid"; else if(dg.nodes[i].word.toLowerCase().equals("overpriced")) dg.nodes[i].lemma = "overprice"; else if(dg.nodes[i].word.toLowerCase().equals("prepped")) dg.nodes[i].lemma = "prep"; else if(dg.nodes[i].word.toLowerCase().equals("quoted")) dg.nodes[i].lemma = "quote"; else if(dg.nodes[i].word.toLowerCase().equals("radicalized")) dg.nodes[i].lemma = "radicalize"; else if(dg.nodes[i].word.toLowerCase().equals("readmitted")) dg.nodes[i].lemma = "readmit"; else if(dg.nodes[i].word.toLowerCase().equals("redefined")) dg.nodes[i].lemma = "redefine"; else if(dg.nodes[i].word.toLowerCase().equals("reinstalled")) dg.nodes[i].lemma = "reinstall"; else if(dg.nodes[i].word.toLowerCase().equals("reloaded")) dg.nodes[i].lemma = "reload"; else if(dg.nodes[i].word.toLowerCase().equals("re-occupied")) dg.nodes[i].lemma = "re-occupy"; else if(dg.nodes[i].word.toLowerCase().equals("situated")) dg.nodes[i].lemma = "situated"; else 
if(dg.nodes[i].word.toLowerCase().equals("self-inflicted")) dg.nodes[i].lemma = "self-inflict"; else if(dg.nodes[i].word.toLowerCase().equals("sourced")) dg.nodes[i].lemma = "source"; else if(dg.nodes[i].word.toLowerCase().equals("spirited")) dg.nodes[i].lemma = "spirit"; else if(dg.nodes[i].word.toLowerCase().equals("spotted")) dg.nodes[i].lemma = "spot"; else if(dg.nodes[i].word.toLowerCase().equals("unexplored")) dg.nodes[i].lemma = "unexplore"; else if(dg.nodes[i].word.toLowerCase().equals("wounded")) dg.nodes[i].lemma = "wound"; } } } /** Debug helper: prints the first three entries of every row separated by tabs and then terminates the JVM. Not called anywhere in the visible code. */ private static void print(ArrayList l) { for(String[] ss: l) { System.out.println(ss[0] + "\t" + ss[1] + "\t" + ss[2]); } System.exit(0); } /** Converts column col of the per-node tag rows into labelled spans. An open span is closed when the tag is the digit 0, a B- tag, or an I- tag whose label differs from the open span; a new span is opened on any B- or I- tag when none is open. The label is the tag without its two-character prefix, tokenStart is 1 plus the document token index of the first node, and tokenEnd is 1 plus the document token index of the last node in the span. @param ss one row of tags per graph node, as produced by compress @param col index of the tag column to read @param tokenIndices document token index of every graph node @param depGraph the graph whose nodes (except node 0) correspond to the rows @return the spans in order of appearance */ private static ArrayList iobToSpans(ArrayList ss, int col, HashMap tokenIndices, DepGraph depGraph) { //System.out.println(tokenIndices); //HashMap m = new HashMap(); int[] m = new int[depGraph.nodes.length - 1]; for(int i = 1; i < depGraph.nodes.length; i++) m[i-1] = tokenIndices.get(depGraph.nodes[i]); ArrayList out = new ArrayList(); Span current = null; for(int i = 0; i < ss.size(); i++) { String iobtag = ss.get(i)[col]; if(current != null && (iobtag.equals("0") || iobtag.startsWith("B-") || iobtag.startsWith("I-") && !current.label.equals(iobtag.substring(2)))) { current.tokenEnd = 1 + m[i-1]; out.add(current); current = null; } if(current == null && (iobtag.startsWith("I-") || iobtag.startsWith("B-"))) { current = new Span(); current.label = iobtag.substring(2); current.tokenStart = 1 + m[i]; } } if(current != null) { current.tokenEnd = 1 + m[m.length-1]; out.add(current); } return out; } /** Aligns the SST tokens with the graph nodes and reduces their tags to one row per node. The array ss holds six fields per SST token; field 0 is the word and fields 3 to 5 are the three tag columns. SST tokens are concatenated until their combined length reaches the length of the node word (only lengths are compared, not the characters). For each tag column the first B- tag among the merged tokens is kept, else the first I- tag, else the digit 0. @return one three-element row per graph node @throws RuntimeException if the concatenated SST tokens become longer than the node word, or if graph nodes are left over once the SST tokens are used up. NOTE(review): if there are more SST tokens than graph nodes, dg.nodes is indexed past its end before the final check is reached. */ private static ArrayList compress(String[] ss, DepGraph dg) { ArrayList out = new ArrayList(); int dgix = 1; int ssix = 0; while(ssix < ss.length / 6) { int ix0 = ssix; String s1 = ss[6*ssix]; String s2 = dg.nodes[dgix].word; while(s1.length() < s2.length()) { ssix++; s1 = s1 + ss[6*ssix]; } if(s1.length() > s2.length()) { throw new RuntimeException("s1 = " + s1 + ", s2 = " + s2); } String[] row = new 
String[3]; for(int i = 0; i < 3; i++) { String tag = null; for(int ix = ix0; ix <= ssix; ix++) { String t = ss[6*ix + 3 + i]; if(t.startsWith("B-")) { tag = t; break; } } if(tag == null) for(int ix = ix0; ix <= ssix; ix++) { String t = ss[6*ix + 3 + i]; if(t.startsWith("I-")) { tag = t; break; } } if(tag == null) tag = "0"; row[i] = tag; } out.add(row); ssix++; dgix++; } if(dgix != dg.nodes.length) throw new RuntimeException("nodes left"); return out; } /** Appends one entry per predicate-argument structure to out, followed by a newline after the last one. Ids are taken from paIdCounter + 1 upward. The locals id and tokenId are computed but do not appear in the appended text in this copy (the literal looks truncated). @return the updated id counter */ private static int printPreds(StringBuilder out, List pas, HashMap tokenIndices, int paIdCounter) { for(PAStructure pa: pas) { int id = ++paIdCounter; int tokenId = 1 + tokenIndices.get(pa.pred); out.append(" " + pa.lemma + "\n"); } out.append("\n"); return paIdCounter; } /** Appends one entry per argument of every predicate-argument structure to out, followed by a newline after the last one. Ids continue from paIdCounter + 1. The locals predTokenId, id and argTokenId are computed but do not appear in the appended text in this copy (the literal looks truncated). @return the updated id counter */ private static int printArgs(StringBuilder out, List pas, HashMap tokenIndices, int paIdCounter) { for(PAStructure pa: pas) { int predTokenId = 1 + tokenIndices.get(pa.pred); for(int i = 0; i < pa.args.size(); i++) { int id = ++paIdCounter; DepNode arg = pa.args.get(i); String argLabel = pa.argLabels.get(i); int argTokenId = 1 + tokenIndices.get(arg); //out.println(" " // + argLabel + ""); out.append(" " + argLabel + "\n"); } } out.append("\n"); return paIdCounter; } /** Prints a single line for the text l, XML-escaped, to out. NOTE(review): the body of this method is damaged in this copy (an else without a matching if and an unterminated first statement), so it does not compile as shown; the use of end and id is not visible. Restore from a pristine copy. */ static void printEntity(String l, int start, int end, int id, PrintWriter out) { StringBuilder sb = new StringBuilder(" "); else sb.append("\" on=\"#" + start + "\">"); sb.append(escapeXML(l)); sb.append(""); out.println(sb); } // end of pasted code /** Writes the MPQA subjectivity layer to subjOut. First, empty spans are cleaned up: spans whose implicit property is true are collapsed to their start and kept; other empty spans are removed unless they are GATE_agent spans whose trimmed id is w or implicit. Then document-specific corpus repairs keyed on baseName are applied, output ids are assigned to all GATE_agent spans in a first pass (agents may be referenced before they appear), and finally agents, expressive-subjectivity, objective-speech-event, target, attitude and direct-subjective spans are written in that order. @param idCounter starting value of the output id counter; ids are allocated from idCounter + 1. The debug flag is always false in this copy. */ private static void printMPQAAnnotation(AnnotatedText text, String baseName, PrintWriter subjOut, int idCounter) { final boolean debug; if(false && baseName.equals("20020306_15.02.54-18922")) { debug = true; } else debug = false; for(AnnotationLayer l: text.layers) { for(Iterator it = l.iterator(); it.hasNext(); ) { Span s = it.next(); if(s.hasProperty("implicit") && s.getProperty("implicit").equals("true")) { s.end = s.start; s.tokenEnd = s.tokenStart; continue; } if(s.start < s.end) continue; if(!s.label.equals("GATE_agent")) { it.remove(); 
continue; } String id = (String) s.getProperty("id"); if(id != null) { id = id.trim(); if(id != null && id.equals("w")) continue; if(id != null && id.equals("implicit")) continue; } it.remove(); // TODO should we really remove all empty agents? } } if(baseName.equals("20020131_20.58.51-26741")) { // bug: insubstantial and nested-source confused on one item for(AnnotationLayer l: text.layers) for(Span s: l) { String ns = (String) s.getProperty("nested-source"); if(ns != null && ns.equals("c2")) { s.setProperty("nested-source", s.getProperty("insubstantial")); s.setProperty("insubstantial", ns); break; } } } else if(baseName.equals("xbank_wsj_0610")) { // bug: missing w, implicit Span w = new Span(); w.start = w.end = 0; w.label = "GATE_agent"; w.setProperty("id", "w"); text.layers.get(1).add(w); Span imp = new Span(); imp.start = w.end = 0; /* NOTE(review): the statement just before this note assigns w.end a second time and never sets imp.end; imp.start = imp.end = 0 was probably intended. Harmless only if a new Span has end 0 by default; confirm. */ imp.label = "GATE_agent"; imp.setProperty("id", "implicit"); text.layers.get(1).add(imp); } else if(baseName.equals("xbank_wsj_0122")) { /* The following four branches each drop one attitude-link value from one specific document. */ for(AnnotationLayer l: text.layers) for(Span s: l) { String al = (String) s.getProperty("attitude-link"); if(al != null && al.matches("agreement")) s.properties.remove("attitude-link"); } } else if(baseName.equals("xbank_wsj_0557")) { for(AnnotationLayer l: text.layers) for(Span s: l) { String al = (String) s.getProperty("attitude-link"); if(al != null && al.matches("expsale")) s.properties.remove("attitude-link"); } } else if(baseName.equals("xbank_wsj_0376")) { for(AnnotationLayer l: text.layers) for(Span s: l) { String al = (String) s.getProperty("attitude-link"); if(al != null && al.matches("ew")) s.properties.remove("attitude-link"); } } else if(baseName.equals("xbank_wsj_0187")) { for(AnnotationLayer l: text.layers) for(Span s: l) { String al = (String) s.getProperty("attitude-link"); if(al != null && al.matches("soffer")) s.properties.remove("attitude-link"); } } HashMap midToLKid = new HashMap(); HashMap spanToLKid = new HashMap(); /* Agents may reference forward or self -- we need a 
first pass. */ for(AnnotationLayer l: text.layers) { Collections.sort(l.spans, Span.ByLeftOrder.instance()); for(Span s: l) { if(s.label.equals("GATE_agent")) { idCounter++; spanToLKid.put(s, "" + idCounter); String id = (String) s.getProperty("id"); if(id != null) { id = id.trim(); midToLKid.put(id, "" + idCounter); } } } } subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_agent")) { //if(debug) // System.out.println(s); //idCounter++; //subjOut.print(" = 0) { // s.tokenStart > 0 needed TODO check // bug in 20020427_22.07.25-26605 subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + s.tokenEnd + "\""); } subjOut.print(">"); // todo handle ns even if the agent comes later subjOut.print(""); subjOut.print(""); if(false) { if(s.start < s.end) { String t = text.text.substring(s.start, s.end); subjOut.print(" "); } subjOut.print(" "); } subjOut.println(); } } } /* Per-document repairs of agent ids that are misspelled or missing in the corpus; two of them map to the hard-coded output ids 24 and 40. */ if(baseName.equals("20010715_00.31.31-4544")) { // bug: mteam -> team String id = midToLKid.get("team"); midToLKid.put("mteam",id); } else if(baseName.equals("20020509_22.11.01-7259")) { // bug: devcon -> devcoun String id = midToLKid.get("devcoun"); midToLKid.put("devcon",id); } else if(baseName.equals("temp_fbis_20.45.06-5529")) { // bug: ungovint -> usgovint String id = midToLKid.get("usgovint"); midToLKid.put("ungovint",id); } else if(baseName.equals("xbank_wsj_0610")) { // bug: wel missing midToLKid.put("wel", "24"); } else if(baseName.equals("xbank_wsj_0376")) { // bug: ana -> analysts String id = midToLKid.get("analysts"); midToLKid.put("ana",id); } else if(baseName.equals("xbank_wsj_0778")) { // bug: bdl missing midToLKid.put("bdl", "40"); } subjOut.println(""); subjOut.print(" "); subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_expressive-subjectivity")) { idCounter++; subjOut.print(" = 0) { subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + 
s.tokenEnd + "\""); } subjOut.print(">"); subjOut.print(""); subjOut.print(""); if(false) { if(s.start < s.end) { String t = text.text.substring(s.start, s.end); subjOut.print(" "); } } subjOut.println(); } } } subjOut.println(""); subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_objective-speech-event")) { idCounter++; subjOut.print(" = 0) { subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + s.tokenEnd + "\""); } subjOut.print(">"); subjOut.print(""); subjOut.print(""); if(false) { if(s.start < s.end) { String t = text.text.substring(s.start, s.end); subjOut.print(" "); } subjOut.print(" "); } subjOut.println(); } } } subjOut.println(""); subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_target")) { idCounter++; subjOut.print(" = 0) { subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + s.tokenEnd + "\""); } subjOut.print(">"); subjOut.print(""); subjOut.print(""); if(s.hasProperty("id")) { String id = (String) s.getProperty("id"); midToLKid.put(id, "" + idCounter); } if(false) { if(s.start < s.end) { String t = text.text.substring(s.start, s.end); subjOut.print(" "); } subjOut.print(" "); } subjOut.println(); } } } subjOut.println(""); subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_attitude")) { idCounter++; subjOut.print(" = 0) { subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + s.tokenEnd + "\""); } if(s.hasProperty("id")) { String id = (String) s.getProperty("id"); midToLKid.put(id, "" + idCounter); } subjOut.print(">"); subjOut.print(""); subjOut.print(""); if(false) { if(s.start < s.end) { String t = text.text.substring(s.start, s.end); subjOut.print(" "); } subjOut.print(" "); } subjOut.println(); } } } subjOut.println(""); if(baseName.equals("xbank_wsj_0187")) { // bug: String id = 
midToLKid.get("agreeinprinciple"); midToLKid.put("agreeincprinciple",id); } subjOut.println(""); for(AnnotationLayer l: text.layers) { for(Span s: l) { if(s.label.equals("GATE_direct-subjective")) { idCounter++; subjOut.print(" = 0) { subjOut.print(" start=\"#" + (s.tokenStart+1) + "\""); subjOut.print(" end=\"#" + s.tokenEnd + "\""); } subjOut.print(">"); subjOut.print(""); subjOut.print(""); if(s.start < s.end) { String t = text.text.substring(s.start, s.end); if(false) subjOut.print(" "); } subjOut.println(); } } } subjOut.println(""); subjOut.println(""); subjOut.flush(); if(debug) System.exit(0); } /** Prints an attribute named att whose value is a comma-separated list of output ids, each prefixed with #, one for every comma-separated MPQA id in s. Ids that are not found in the ids map are reported on stderr and left out of the list. @param s comma-separated MPQA ids; surrounding whitespace is ignored @param att attribute name @param baseName not used in this method @param ids map from MPQA id to output id @param pw destination */ private static void printIdList(String s, String att, String baseName, HashMap ids, PrintWriter pw) { String[] ss = s.trim().split("\\s*,\\s*"); pw.print(" " + att + "=\""); boolean first = true; for(int i = 0; i < ss.length; i++) { String lid = ids.get(ss[i]); if(lid == null) { //throw new RuntimeException("unknown id |" + ss[i] + "|"); System.err.println("*** Warning: unknown id |" + ss[i] + "|"); continue; } if(!first) pw.print(","); else first = false; pw.print("#" + lid); } pw.print("\""); } /** Replaces the ampersand, double quote, less-than and greater-than characters in s. NOTE(review): in this copy the replacement strings appear to have been entity-decoded (each replacement equals its pattern, and the double-quote statement is not valid Java); the intended replacements are presumably the corresponding XML entities. Restore from a pristine copy. */ private static String escapeXML(String s) { s = s.replaceAll("&", "&"); s = s.replaceAll("\"", """); s = s.replaceAll("<", "<"); s = s.replaceAll(">", ">"); return s; } /** Entry point. argv[0] is the MPQA directory passed to MPQAReader.processDirectory, argv[1] the output directory, argv[2] the graph file read by CoNLL2008Format and argv[3] the SST file; the last two are read through GZIPInputStream when their name ends in .gz. Every document is passed to processText; afterwards the two sentence counters are printed and a graph still left in the graph input raises RuntimeException. NOTE(review): argv is indexed without a length check, all exceptions are caught and only printed, the two inputs are never closed, and the readers use the platform default charset rather than ENCODING. */ public static void main(String[] argv) { String dir = argv[1]; String srlFile = argv[2]; String sstFile = argv[3]; try { //Scanner srlInput = new Scanner(new File(srlFile)); Scanner srlInput; if(srlFile.endsWith(".gz")) { InputStream is = new GZIPInputStream(new FileInputStream(srlFile)); srlInput = new Scanner(is); } else srlInput = new Scanner(new File(srlFile)); BufferedReader sstInput; if(sstFile.endsWith(".gz")) { InputStream is = new GZIPInputStream(new FileInputStream(sstFile)); sstInput = new BufferedReader(new InputStreamReader(is)); } else sstInput = new BufferedReader(new FileReader(sstFile)); Iterator i = MPQAReader.processDirectory(argv[0]); while(i.hasNext()) { AnnotatedText t = i.next(); 
processText(dir, t, srlInput, sstInput); } System.out.println("sen0 = " + sen0); System.out.println("sen1 = " + sen1); Triple> tr = CoNLL2008Format.readNextGraph(srlInput); if(tr != null) { throw new RuntimeException("did not read all trees"); } } catch(Exception e) { e.printStackTrace(); } } }