/*
 * LKPolSubjSentClassifier: command-line driver that loads a serialized model
 * (a SymbolEncoder, two Classifier objects and, when
 * BOWPolSubjSentClassifier.USE_SUBJLEX is set, a subjectivity lexicon), turns
 * each input sentence into a SparseVector with
 * BOWPolSubjSentClassifier.representSentence, and prints the predicted
 * pos/neu/neg intensities per sentence.
 *
 * NOTE(review): THIS COPY OF THE FILE IS DAMAGED. Restore it from version
 * control before compiling or changing any logic.
 *   - All line breaks were collapsed, so every "//" comment now swallows the
 *     rest of its physical line (several imports and the class header on the
 *     first line, among others).
 *   - Text between angle brackets appears to have been stripped: generic type
 *     arguments are gone ("Pair>", "Pair, Triple>", raw HashMap/ArrayList) and
 *     many string literals that presumably held XML tags are now empty
 *     (line.contains(""), line.startsWith(""), out.println("")).
 *   - Part of readNextSentence is missing: lemmaLine and on are used but never
 *     declared in the visible text, and the braces there do not balance.
 *   Everything below describes only what the visible text shows.
 *
 * Helpers
 *   extractStringByPattern(s, p)  group 1 of the first match of p in s, or null
 *                                 when p does not match.
 *   extractIntByPattern(s, p)     same, parsed with Integer.parseInt; throws
 *                                 IllegalArgumentException when p does not match.
 *   extractToken(line)            text between the first '>' and the following
 *                                 '<', passed through Strings.decodeXML; throws
 *                                 RuntimeException when either is absent.
 *
 * Readers
 *   readSentenceSpans(prefix, file)
 *       Opens prefix/file and skips to the line containing provides="SENTENCES";
 *       returns null when there is no such line. The token file name is the
 *       scope="..." attribute of that line, defaulting to file. Collects
 *       {id, start, end} int arrays and returns Pair(tokenFile, spans).
 *       NOTE(review): the reader is never closed, and FileReader uses the
 *       platform charset while every other reader in this class uses UTF-8.
 *       NOTE(review): the collecting loop dereferences line without a null
 *       check; looks like an NPE at end of file - confirm on the intact source.
 *   readTokenPreamble(br)
 *       Scans to a line containing the quoted word "base", takes the text
 *       between '>' and '<' on that line, then advances to the line containing
 *       provides="TOKENS" and returns the base text. Throws RuntimeException
 *       when the TOKENS line is missing. The first scan has no null check.
 *   readPreamble(br, prov)
 *       Advances br past the line containing provides="<prov>"; throws
 *       RuntimeException when it is not found.
 *   readGoldSS(br)
 *       Maps the on="#..." value of each line to Intensities(pos, neu, neg)
 *       built from the pos=/neu=/neg= attribute strings (l, m or h; null when
 *       the attribute is absent). The loop terminator literal is lost.
 *   readNextSentence(sen, br, pbr, lbr)
 *       sen is {sentence id, first token id, last token id}. Reads token lines
 *       from br until the token id equals end; from the token whose id equals
 *       start onwards it records the token, a POS tag read from pbr and a lemma
 *       read from lbr. lbr must support mark/reset (LOOKAHEAD = 256 is the mark
 *       limit passed to mark). The lemma is null when the lemma line is empty
 *       or its on= id differs from the token id; in the latter case lbr is
 *       reset. Builds a DepGraph of out.size() whose nodes 1..n receive word,
 *       pos and lemma, and returns Pair(tokens, Triple(dg, dg, null)).
 *
 * Processing
 *   processDirectory(enc, cl1, cl2, subjLex, dirName, outputDir)
 *       For every non-directory file in dirName that has a SENTENCES layer:
 *       opens the token file, opens the matching .pos.xml file twice (one
 *       reader positioned at the POS layer, one at the LEMMA layer), tries to
 *       load gold annotations from the matching .subjsen.xml, then classifies
 *       each sentence. cl1 decides whether the sentence is subjective
 *       (BOWPolSubjSentClassifier.SUBJ); only then does cl2 produce the
 *       Intensities. With PRINT_NEUTRAL == false the neu field is overwritten
 *       with NONE before printing and before evaluation. Finally prints
 *       documents/sentences per second and, when gold data was seen, the stats.
 *       NOTE(review): the gold input name and the output name are derived by
 *       the same replaceFirst; if outputDir equals dirName the gold file is
 *       overwritten (it is read before the writer is opened).
 *       NOTE(review): catch(Exception) around gold loading is silent. This is
 *       presumably meant for "no gold file", but it hides every other failure
 *       too, and ssReader is never closed.
 *       NOTE(review): failure to open the POS file calls System.exit(1) with a
 *       message about the file name, whatever the actual cause.
 *       NOTE(review): readers and the writer are closed only on the success
 *       path.
 *   evalSS(ssGoldMap, id, guess, stats)
 *       No-op when guess is null. stats[0] is overall, stats[1..3] are
 *       pos/neu/neg. Counts a guess per category when its intensity is
 *       >= THRESHOLD (THRESHOLD = BOWPolSubjSentClassifier.LOW), looks up gold
 *       by the key "" + id, and counts overlap/correct when both sides reach
 *       THRESHOLD / are equal.
 *
 * Entry points
 *   main(argv)       -makePackage (currently a no-op, call commented out),
 *                    -lk -> mainLK, -conll -> mainCoNLL, otherwise
 *                    RuntimeException. argv[0] is read without a length check.
 *   mainLK(argv)     argv[1] model file, argv[2] input directory, argv[3]
 *                    output directory; loads the model and calls
 *                    processDirectory. Any exception: stack trace, exit 1.
 *   mainCoNLL(argv)  argv[1] model file; reads graphs from stdin (UTF-8) with
 *                    CoNLL2008Format.readNextGraph and prints, per token, the
 *                    word followed by tab-separated T/F flags for pos, neu and
 *                    neg, then an empty line per sentence. Non-subjective
 *                    sentences get F F F.
 *                    NOTE(review): flags use "> LOW" while evalSS uses
 *                    ">= THRESHOLD" (also LOW) - confirm which is intended.
 *   NOTE(review): both mainLK and mainCoNLL use Java native deserialization on
 *   the model file and never close the ObjectInputStream; only feed trusted
 *   model files.
 *
 * Output
 *   printSubjSenPreamble / printSubjSen / printSubjSenEnd write the output
 *   document; their literals are lost in this copy. printSubjSen writes nothing
 *   when pos, neu and neg are all NONE and emits an attribute only for
 *   intensities above NONE. sentenceFileName is not used in the visible text
 *   of printSubjSenPreamble.
 */
package mpqareader; import java.io.*; import java.util.*; import java.util.regex.*; import se.lth.cs.nlp.depsrl.format.CoNLL2008Format; import se.lth.cs.nlp.depsrl.format.PAStructure; import se.lth.cs.nlp.nlputils.core.*; import se.lth.cs.nlp.nlputils.ml_long.*; //import srlpostprocess.TokenSemNode; import se.lth.cs.nlp.nlputils.depgraph.*; //import se.lth.cs.nlp.depsrl.format.*; public class LKPolSubjSentClassifier { static Pattern SEN_ID_PAT = Pattern.compile("id\\s*=\\s*\"(.*?)\""); static Pattern SEN_START_PAT = Pattern.compile("start\\s*=\\s*\"#(.*?)\""); static Pattern SEN_END_PAT = Pattern.compile("end\\s*=\\s*\"#(.*?)\""); static Pattern ON_PAT = Pattern.compile("on\\s*=\\s*\"#(.*?)\""); static Pattern SCOPE_PAT = Pattern.compile("scope\\s*=\\s*\"(.*?)\""); static Pattern POS_PAT = Pattern.compile("pos=\"([lmh])\""); static Pattern NEU_PAT = Pattern.compile("neu=\"([lmh])\""); static Pattern NEG_PAT = Pattern.compile("neg=\"([lmh])\""); private static int extractIntByPattern(String s, Pattern p) { String match = extractStringByPattern(s, p); if(match == null) throw new IllegalArgumentException("Could not find match"); return Integer.parseInt(match); //extractStringByPattern(s, p)); } private static String extractStringByPattern(String s, Pattern p) { Matcher m = p.matcher(s); if(!m.find()) return null; String match = m.group(1); return match; } private static Pair> readSentenceSpans(String prefix, String file) throws IOException { BufferedReader br = new BufferedReader(new FileReader(prefix + File.separatorChar + file)); String line = br.readLine(); while(line != null && !line.contains("provides=\"SENTENCES\"")) line = br.readLine(); if(line == null) return null; String tokenFile = extractStringByPattern(line, SCOPE_PAT); if(tokenFile == null) tokenFile = file; ArrayList out = new ArrayList(); line = br.readLine(); while(!line.contains("")) { line = line.trim(); if(line.startsWith("") && !line.endsWith("")) throw new RuntimeException("Sorry, we expect 
sentences line by line..."); int id = extractIntByPattern(line, SEN_ID_PAT); int start = extractIntByPattern(line, SEN_START_PAT); int end = extractIntByPattern(line, SEN_END_PAT); out.add(new int[] { id, start, end }); } line = br.readLine(); } return new Pair(tokenFile, out); } private static String extractToken(String line) { int s_ix = line.indexOf('>'); int e_ix = line.indexOf('<', s_ix); if(s_ix == -1) throw new RuntimeException("Sorry, we expect tokens line by line..."); if(e_ix == -1) throw new RuntimeException("Sorry, we expect tokens line by line..."); String token = line.substring(s_ix + 1, e_ix); return Strings.decodeXML(token); } private static String readTokenPreamble(BufferedReader br) throws IOException { String line = br.readLine(); while(!line.contains("\"base\"")) line = br.readLine(); int s_ix = line.indexOf(">"); int e_ix = line.indexOf('<', s_ix); String base = line.substring(s_ix + 1, e_ix); while(line != null && !line.contains("provides=\"TOKENS\"")) line = br.readLine(); if(line == null) throw new RuntimeException("Couldn't find token annotation"); return base; } private static void readPreamble(BufferedReader br, String prov) throws IOException { String line = br.readLine(); while(line != null && !line.contains("provides=\"" + prov + "\"")) line = br.readLine(); if(line == null) throw new RuntimeException("Couldn't find annotation layer " + prov); } private static final int LOOKAHEAD = 256; private static HashMap readGoldSS(BufferedReader br) throws IOException { HashMap out = new HashMap(); String line = br.readLine().trim(); while(!line.startsWith("")) { String on = extractStringByPattern(line, ON_PAT); String pos = extractStringByPattern(line, POS_PAT); String neu = extractStringByPattern(line, NEU_PAT); String neg = extractStringByPattern(line, NEG_PAT); Intensities i = new Intensities(pos, neu, neg); out.put(on, i); line = br.readLine().trim(); } //System.out.println("Gold SS: " + out); return out; } private static Pair, Triple> 
readNextSentence(int[] sen, BufferedReader br, BufferedReader pbr, BufferedReader lbr) throws IOException { if(!lbr.markSupported()) throw new RuntimeException("!!!"); int senId = sen[0]; int start = sen[1]; int end = sen[2]; ArrayList out = new ArrayList(); ArrayList tags = new ArrayList(); ArrayList lemmas = new ArrayList(); /*String line = br.readLine().trim(); while(line.equals("")) line = br.readLine().trim(); if(!line.startsWith("")) { System.err.println("line = " + line); throw new RuntimeException("Sorry, we expect tokens line by line..."); } int id = extractIntByPattern(line, SEN_ID_PAT); while(id != start) { line = br.readLine().trim(); while(line.equals("")) line = br.readLine().trim(); if(line.contains("")) { throw new RuntimeException("Problem finding sentence " + senId + " start=" + start + " end=" + end); } if(!line.startsWith("")) { System.err.println("line = " + line); throw new RuntimeException("Sorry, we expect tokens line by line..."); } id = extractIntByPattern(line, SEN_ID_PAT); } out.add(extractToken(line));*/ boolean inside = false; int id = -1; String line; while(id != end) { line = br.readLine().trim(); while(line.equals("")) line = br.readLine().trim(); if(!line.startsWith("")) throw new RuntimeException("Sorry, we expect tokens line by line..."); id = extractIntByPattern(line, SEN_ID_PAT); if(id == start) inside = true; if(inside) { pbr.mark(LOOKAHEAD); String posLine = pbr.readLine().trim(); if(!posLine.startsWith(""))) { System.err.println("end = " + end); System.err.println("line = |" + line + "|"); System.err.println("posLine = |" + posLine + "|"); throw new RuntimeException("Sorry, we expect POS tags line by line..."); } if(posLine.startsWith(""))) { System.err.println("line = |" + line + "|"); System.err.println("posLine = |" + posLine + "|"); System.err.println("lemmaLine = |" + lemmaLine + "|"); throw new RuntimeException("Sorry, we expect lemmas line by line..."); } } else { lbr.reset(); } if(false) { System.err.println(line); 
System.err.println(posLine); System.err.println(lemmaLine); } String lemma = null; if(!lemmaLine.equals("")) { on = extractIntByPattern(lemmaLine, ON_PAT); if(on != id) { lbr.reset(); } else lemma = extractToken(lemmaLine); } String token = extractToken(line); String posTag = extractToken(posLine); out.add(token); tags.add(posTag); lemmas.add(lemma); } } DepGraph dg = new DepGraph(out.size()); for(int i = 1; i < dg.nodes.length; i++) { dg.nodes[i] = new DepNode(); dg.nodes[i].word = out.get(i - 1); dg.nodes[i].pos = tags.get(i - 1); dg.nodes[i].lemma = lemmas.get(i - 1); } /*for(int i = 1; i < dg.nodes.length; i++) { System.out.println(i + "\t" + dg.nodes[i].word + "\t" + dg.nodes[i].pos + "\t" + dg.nodes[i].lemma); System.out.flush(); } System.out.println(); System.out.flush();*/ return new Pair(out, new Triple(dg, dg, null)); } private static final int THRESHOLD = BOWPolSubjSentClassifier.LOW; private static final boolean PRINT_NEUTRAL = false; public static void processDirectory(SymbolEncoder enc, Classifier cl1, Classifier cl2, SubjectivityLexicon subjLex, String dirName, String outputDir) throws IOException { long t0 = System.currentTimeMillis(); int countSentences = 0, countDocuments = 0; PRFStats[] stats = new PRFStats[4]; for(int i = 0; i < stats.length; i++) stats[i] = new PRFStats(); File dir = new File(dirName); if(!dir.isDirectory()) throw new IllegalArgumentException(dir + " is not a directory"); for(File file: dir.listFiles()) { if(file.isDirectory()) continue; Pair> p = readSentenceSpans(dirName, file.getName()); if(p == null) continue; String tokenFile = p.left; ArrayList senSpans = p.right; countDocuments++; System.out.println("Sentence file " + file.getName() + ", token file " + tokenFile); //if(countDocuments > 10) // break; BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + tokenFile), "UTF-8")); String base = readTokenPreamble(br); String outFile; if(tokenFile.endsWith(".xml")) 
outFile = tokenFile.replaceFirst("\\.[^\\.]+\\.xml$", ".subjsen.xml"); else outFile = tokenFile + ".subjsen.xml"; outFile = outputDir + File.separatorChar + outFile; String posFile = null; if(tokenFile.endsWith(".xml")) posFile = tokenFile.replaceFirst("\\.[^\\.]+\\.xml$", ".pos.xml"); else posFile = tokenFile + ".pos.xml"; BufferedReader posReader = null; BufferedReader lemmaReader = null; try { posReader = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + posFile), "UTF-8")); lemmaReader = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + posFile), "UTF-8")); } catch(Exception e) { System.err.println("Sorry, we expect the POS file name to end with .pos.xml"); System.exit(1); } readPreamble(posReader, "POS"); readPreamble(lemmaReader, "LEMMA"); HashMap ssGoldMap = null; try { BufferedReader ssReader = null; String ssInFile = null; if(tokenFile.endsWith(".xml")) ssInFile = tokenFile.replaceFirst("\\.[^\\.]+\\.xml$", ".subjsen.xml"); else ssInFile = tokenFile + ".subjsen.xml"; ssReader = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + ssInFile), "UTF-8")); readPreamble(ssReader, "SUBJECTIVE-SENTENCES"); ssGoldMap = readGoldSS(ssReader); for(Intensities inten: ssGoldMap.values()) { stats[0].nInGold++; if(inten.pos >= THRESHOLD) stats[1].nInGold++; if(inten.neu >= THRESHOLD) stats[2].nInGold++; if(inten.neg >= THRESHOLD) stats[3].nInGold++; } } catch(Exception e) { // skipping... 
} PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8")); printSubjSenPreamble(base, file.getName(), out); int idCounter = 0; for(int[] senSpan: senSpans) { Pair, Triple> pp = readNextSentence(senSpan, br, posReader, lemmaReader); ArrayList sen = pp.left; if(sen == null) throw new RuntimeException("null"); SparseVector sv = BOWPolSubjSentClassifier.representSentence(sen, pp.right, null, null, null, subjLex, enc); if(BOWPolSubjSentClassifier.SET) for(int j = 0; j < sv.index; j++) sv.values[j] = 1.0; if(BOWPolSubjSentClassifier.NORMALIZE) BOWPolSubjSentClassifier.normalize2(sv); Intensities guess = null; int guess1 = cl1.classify(sv); if(guess1 == BOWPolSubjSentClassifier.SUBJ) { //printSubjSen(++idCounter, senSpan[0], out); guess = cl2.classify(sv); if(!PRINT_NEUTRAL) guess.neu = BOWPolSubjSentClassifier.NONE; printSubjSen(++idCounter, senSpan[0], out, guess); } if(ssGoldMap != null) { evalSS(ssGoldMap, senSpan[0], guess, stats); } countSentences++; } printSubjSenEnd(out); out.close(); posReader.close(); lemmaReader.close(); br.close(); } long t1 = System.currentTimeMillis(); double ssPerSec = 1000.0 * countSentences / (t1 - t0); double docsPerSec = 1000.0 * countDocuments / (t1 - t0); System.out.format("Processed %f documents per second.\n", docsPerSec); System.out.format("Processed %f sentences per second.\n", ssPerSec); if(stats[0].nInGold > 0) { for(int i = 0; i < stats.length; i++) stats[i].print(); } } private static void evalSS(HashMap ssGoldMap, int id, Intensities guess, PRFStats[] stats) { if(guess == null) return; stats[0].nGuesses++; if(guess.pos >= THRESHOLD) stats[1].nGuesses++; if(guess.neu >= THRESHOLD) stats[2].nGuesses++; if(guess.neg >= THRESHOLD) stats[3].nGuesses++; Intensities goldInt = ssGoldMap.get("" + id); if(goldInt == null) return; stats[0].nOverlap++; if(goldInt.equals(guess)) stats[0].nCorrect++; if(guess.pos >= THRESHOLD && goldInt.pos >= THRESHOLD) { stats[1].nOverlap++; if(guess.pos == 
goldInt.pos) stats[1].nCorrect++; } if(guess.neu >= THRESHOLD && goldInt.neu >= THRESHOLD) { stats[2].nOverlap++; if(guess.neu == goldInt.neu) stats[2].nCorrect++; } if(guess.neg >= THRESHOLD && goldInt.neg >= THRESHOLD) { stats[3].nOverlap++; if(guess.neg == goldInt.neg) stats[3].nCorrect++; } } public static void main(String[] argv) { if(argv[0].equals("-makePackage")) { //makePackage(argv); } else if(argv[0].equals("-lk")) { mainLK(argv); } else if(argv[0].equals("-conll")) { mainCoNLL(argv); } else { throw new RuntimeException("Illegal options: " + Arrays.asList(argv)); } } public static void mainLK(String[] argv) { String modelFile = argv[1]; String lkDir = argv[2]; String outputDir = argv[3]; try { ObjectInputStream ois = new ObjectInputStream(Util.openFileStream(modelFile)); SymbolEncoder enc = (SymbolEncoder) ois.readObject(); Classifier cl1 = (Classifier) ois.readObject(); Classifier cl2 = (Classifier) ois.readObject(); SubjectivityLexicon subjLex; if(BOWPolSubjSentClassifier.USE_SUBJLEX) { Object o = ois.readObject(); if(o instanceof HashMap) subjLex = new SubjectivityLexicon((HashMap) o); else if(o instanceof SubjectivityLexicon) subjLex = (SubjectivityLexicon) o; else throw new RuntimeException("!!!"); } else subjLex = null; processDirectory(enc, cl1, cl2, subjLex, lkDir, outputDir); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void mainCoNLL(String[] argv) { String modelFile = argv[1]; try { ObjectInputStream ois = new ObjectInputStream(Util.openFileStream(modelFile)); SymbolEncoder enc = (SymbolEncoder) ois.readObject(); Classifier cl1 = (Classifier) ois.readObject(); Classifier cl2 = (Classifier) ois.readObject(); SubjectivityLexicon subjLex; if(BOWPolSubjSentClassifier.USE_SUBJLEX) { Object o = ois.readObject(); if(o instanceof HashMap) subjLex = new SubjectivityLexicon((HashMap) o); else if(o instanceof SubjectivityLexicon) subjLex = (SubjectivityLexicon) o; else throw new RuntimeException("!!!"); } else subjLex = 
null; BufferedReader br = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); Triple> tr = CoNLL2008Format.readNextGraph(br); while(tr != null) { ArrayList sen = new ArrayList(); for(int i = 1; i < tr.first.nodes.length; i++) sen.add(tr.first.nodes[i].word); SparseVector sv = BOWPolSubjSentClassifier.representSentence(sen, tr, null, null, null, subjLex, enc); if(BOWPolSubjSentClassifier.SET) for(int j = 0; j < sv.index; j++) sv.values[j] = 1.0; if(BOWPolSubjSentClassifier.NORMALIZE) BOWPolSubjSentClassifier.normalize2(sv); Intensities guess = null; int guess1 = cl1.classify(sv); if(guess1 == BOWPolSubjSentClassifier.SUBJ) { //printSubjSen(++idCounter, senSpan[0], out); guess = cl2.classify(sv); if(!PRINT_NEUTRAL) guess.neu = BOWPolSubjSentClassifier.NONE; for(int i = 1; i < tr.first.nodes.length; i++) { pw.print(tr.first.nodes[i].word); pw.print("\t" + (guess.pos > BOWPolSubjSentClassifier.LOW? "T": "F")); pw.print("\t" + (guess.neu > BOWPolSubjSentClassifier.LOW? "T": "F")); pw.println("\t" + (guess.neg > BOWPolSubjSentClassifier.LOW? 
"T": "F")); } pw.println(); } else { for(int i = 1; i < tr.first.nodes.length; i++) { pw.print(tr.first.nodes[i].word); pw.println("\tF\tF\tF"); } pw.println(); } tr = CoNLL2008Format.readNextGraph(br); } pw.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static void printSubjSenPreamble(String base, String sentenceFileName, PrintWriter out) throws IOException { out.println(""); out.println(""); out.println(""); out.println(" " + base + ""); out.println(" BOWPolSubjSentClassifier"); out.println(""); out.println(""); } private static void printSubjSen(int nextId, int senId, PrintWriter out, Intensities guess) throws IOException { if(guess.pos == BOWPolSubjSentClassifier.NONE && guess.neu == BOWPolSubjSentClassifier.NONE && guess.neg == BOWPolSubjSentClassifier.NONE) return; String pos = BOWPolSubjSentClassifier.fromIntValueShort(guess.pos); String neu = BOWPolSubjSentClassifier.fromIntValueShort(guess.neu); String neg = BOWPolSubjSentClassifier.fromIntValueShort(guess.neg); out.print(" BOWPolSubjSentClassifier.NONE) out.print(" pos=\"" + pos + "\""); if(guess.neu > BOWPolSubjSentClassifier.NONE) out.print(" neu=\"" + neu + "\""); if(guess.neg > BOWPolSubjSentClassifier.NONE) out.print(" neg=\"" + neg + "\""); out.println("/>"); } private static void printSubjSenEnd(PrintWriter out) throws IOException { out.println(""); out.println(""); } }