// NOTE(review): This file has been damaged by an angle-bracket-stripping
// extraction pass: the contents of many "<...>" spans -- XML tag string
// literals, generic type parameters (e.g. Pair<...>, ArrayList<...>), and in
// places adjacent code -- were deleted. The code below is reformatted for
// readability but every remaining code token is kept byte-for-byte; it will
// NOT compile until the stripped spans are restored from the original source.
// Each damaged spot is flagged with a NOTE(review) comment.
package mpqa_seq_reranker;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
import mpqareader.*;
import se.lth.cs.nlp.nlputils.annotations.AnnotationLayer;
import se.lth.cs.nlp.nlputils.annotations.Span;
import se.lth.cs.nlp.nlputils.core.DoubleObjPair;
import se.lth.cs.nlp.nlputils.core.Pair;
import se.lth.cs.nlp.nlputils.core.Strings;
import se.lth.cs.nlp.nlputils.core.Triple;
import se.lth.cs.nlp.nlputils.depgraph.DepGraph;
import se.lth.cs.nlp.nlputils.depgraph.DepNode;
import se.lth.cs.nlp.nlputils.ml_long.Classifier;
import se.lth.cs.nlp.nlputils.ml_long.SparseVector;
import se.lth.cs.nlp.nlputils.ml_long.SymbolEncoder;
import se.lth.cs.nlp.opinions.*;

/**
 * Command-line entry point for a subjectivity-expression tagging system.
 *
 * Two modes (selected by argv[0] in {@link #main}):
 * <ul>
 *   <li>{@code -package}: bundles a subjectivity lexicon, a sequence-labeling
 *       model, a reranking feature extractor and a reranker classifier into a
 *       single serialized model file ({@link #makePackage}).</li>
 *   <li>{@code -run}: loads such a bundle and tags every annotation file in a
 *       directory, writing a subjectivity XML layer per input file
 *       ({@link #run} / {@link #processDirectory}).</li>
 * </ul>
 */
public class FullSystem {

    /**
     * Packages the four model components into one file.
     *
     * argv layout: [1] subjectivity lexicon file, [2] sequence-labeler model
     * file, [3] reranker file (contains a serialized RerankingFE followed by a
     * Classifier), [4] output bundle file. The bundle is written in the order
     * lexicon, tagging model, feature extractor, reranker -- {@link #run}
     * reads it back in exactly that order.
     *
     * NOTE(review): exceptions are only printed, not rethrown, so a failed
     * packaging run still exits with status 0.
     */
    public static void makePackage(String[] argv) {
        try {
            String subjLexFile = argv[1];
            String seqLabelerFile = argv[2];
            String rerankerFile = argv[3];
            String outFile = argv[4];
            SubjectivityLexicon lex = new SubjectivityLexicon(subjLexFile);
            TaggingModel tm = SeqLabeler.loadModel(seqLabelerFile);
            ObjectInputStream ois = new ObjectInputStream(new FileInputStream(rerankerFile));
            RerankingFE fe = (RerankingFE) ois.readObject();
            Classifier reranker = (Classifier) ois.readObject();
            ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outFile));
            oos.writeObject(lex);
            oos.writeObject(tm);
            oos.writeObject(fe);
            oos.writeObject(reranker);
            oos.close();
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    // Regexes for pulling attribute values out of one XML element per line.
    // Group 1 captures the attribute value; START/END/ON strip a leading '#'
    // from fragment-style references (start="#12" -> 12).
    static Pattern SEN_ID_PAT = Pattern.compile("id\\s*=\\s*\"(.*?)\"");
    static Pattern SEN_START_PAT = Pattern.compile("start\\s*=\\s*\"#(.*?)\"");
    static Pattern SEN_END_PAT = Pattern.compile("end\\s*=\\s*\"#(.*?)\"");
    static Pattern ON_PAT = Pattern.compile("on\\s*=\\s*\"#(.*?)\"");
    static Pattern SCOPE_PAT = Pattern.compile("scope\\s*=\\s*\"(.*?)\"");

    /**
     * Applies pattern p to s and parses group 1 as an int.
     *
     * @throws IllegalArgumentException if the pattern does not match
     * @throws NumberFormatException if the matched text is not an integer
     */
    private static int extractIntByPattern(String s, Pattern p) {
        String match = extractStringByPattern(s, p);
        if(match == null)
            throw new IllegalArgumentException("Could not find match");
        return Integer.parseInt(match); //extractStringByPattern(s, p));
    }

    /** Returns group 1 of the first match of p in s, or null if no match. */
    private static String extractStringByPattern(String s, Pattern p) {
        Matcher m = p.matcher(s);
        if(!m.find())
            return null;
        String match = m.group(1);
        return match;
    }

    /**
     * Reads the sentence-span annotation layer from prefix/file.
     *
     * Scans for the element carrying provides="SENTENCES", takes its scope
     * attribute as the token-file name (falling back to the sentence file
     * itself), then collects one {id, start, end} int triple per sentence
     * element. Returns null when the file has no SENTENCES layer.
     *
     * NOTE(review): the return type's generic parameters and the XML tag
     * literals in contains()/startsWith()/endsWith() were stripped by the
     * extractor; the declared type was presumably Pair<String,
     * ArrayList<int[]>> -- confirm against the original source. The braces in
     * the while loop below are unbalanced for the same reason: an opening
     * brace inside a stripped "<...>" span is missing.
     */
    private static Pair> readSentenceSpans(String prefix, String file) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(prefix + File.separatorChar + file));
        String line = br.readLine();
        while(line != null && !line.contains("provides=\"SENTENCES\""))
            line = br.readLine();
        if(line == null)
            return null;
        String tokenFile = extractStringByPattern(line, SCOPE_PAT);
        if(tokenFile == null)
            tokenFile = file;
        ArrayList out = new ArrayList();
        line = br.readLine();
        // NOTE(review): loop condition and the startsWith/endsWith literals
        // below originally held XML tags (closing tag of the layer, sentence
        // element) that were destroyed by the extraction.
        while(!line.contains("")) {
            line = line.trim();
            if(line.startsWith("") && !line.endsWith(""))
                throw new RuntimeException("Sorry, we expect sentences line by line...");
            int id = extractIntByPattern(line, SEN_ID_PAT);
            int start = extractIntByPattern(line, SEN_START_PAT);
            int end = extractIntByPattern(line, SEN_END_PAT);
            out.add(new int[] { id, start, end });
            } // NOTE(review): stray closing brace -- its opener was lost in a stripped span
            line = br.readLine();
        }
        return new Pair(tokenFile, out);
    }

    /**
     * Extracts the text content between the first '>' and the following '<'
     * of a one-line XML element and XML-decodes it.
     *
     * @throws RuntimeException if the element is not on a single line
     */
    private static String extractToken(String line) {
        int s_ix = line.indexOf('>');
        int e_ix = line.indexOf('<', s_ix);
        if(s_ix == -1)
            throw new RuntimeException("Sorry, we expect tokens line by line...");
        if(e_ix == -1)
            throw new RuntimeException("Sorry, we expect tokens line by line...");
        String token = line.substring(s_ix + 1, e_ix);
        return Strings.decodeXML(token);
    }

    /**
     * Skips the preamble of a token file: reads up to the line containing
     * "base", extracts that element's text content as the base name, then
     * skips forward to the provides="TOKENS" layer.
     *
     * @return the base-document name found in the preamble
     * @throws RuntimeException if no TOKENS layer follows
     */
    private static String readTokenPreamble(BufferedReader br) throws IOException {
        String line = br.readLine();
        // NOTE(review): NPE here if EOF is hit before a "base" line appears.
        while(!line.contains("\"base\""))
            line = br.readLine();
        int s_ix = line.indexOf(">");
        int e_ix = line.indexOf('<', s_ix);
        String base = line.substring(s_ix + 1, e_ix);
        while(line != null && !line.contains("provides=\"TOKENS\""))
            line = br.readLine();
        if(line == null)
            throw new RuntimeException("Couldn't find token annotation");
        return base;
    }

    /**
     * Advances br past its preamble to the line containing
     * provides="&lt;prov&gt;".
     *
     * @throws RuntimeException if the layer is never found
     */
    private static void readPreamble(BufferedReader br, String prov) throws IOException {
        String line = br.readLine();
        while(line != null && !line.contains("provides=\"" + prov + "\""))
            line = br.readLine();
        if(line == null)
            throw new RuntimeException("Couldn't find annotation layer " + prov);
    }

    // Max bytes of read-ahead when mark()ing the POS/lemma readers so they can
    // be reset() if the next element does not belong to the current token.
    private static final int LOOKAHEAD = 256;

    /**
     * Reads the tokens, POS tags and lemmas of one sentence (token ids
     * sen[1]..sen[2]) from the three parallel readers and assembles a
     * DepGraph whose node i holds word/pos/lemma of token i-1.
     *
     * Lemmas are optional: the lemma reader is mark()ed before each read and
     * reset() when the lemma element's on="#id" does not refer to the current
     * token, leaving that token's lemma null.
     *
     * NOTE(review): the generic parameters of the return type and several
     * startsWith("")/equals("") literals were stripped; the declarations of
     * lemmaLine and on were lost inside a stripped "<...>" span, and the
     * parenthesis count in the two error-check conditions is off for the same
     * reason. sen[0] (senId) is only used by the commented-out code below.
     */
    private static Pair, Triple> readNextSentence(int[] sen, BufferedReader br, BufferedReader pbr, BufferedReader lbr) throws IOException {
        if(!lbr.markSupported())
            throw new RuntimeException("!!!");
        int senId = sen[0];
        int start = sen[1];
        int end = sen[2];
        ArrayList out = new ArrayList();
        ArrayList tags = new ArrayList();
        ArrayList lemmas = new ArrayList();
        /*String line = br.readLine().trim();
        while(line.equals("")) line = br.readLine().trim();
        if(!line.startsWith("")) {
            System.err.println("line = " + line);
            throw new RuntimeException("Sorry, we expect tokens line by line...");
        }
        int id = extractIntByPattern(line, SEN_ID_PAT);
        while(id != start) {
            line = br.readLine().trim();
            while(line.equals("")) line = br.readLine().trim();
            if(line.contains("")) {
                throw new RuntimeException("Problem finding sentence " + senId + " start=" + start + " end=" + end);
            }
            if(!line.startsWith("")) {
                System.err.println("line = " + line);
                throw new RuntimeException("Sorry, we expect tokens line by line...");
            }
            id = extractIntByPattern(line, SEN_ID_PAT);
        }
        out.add(extractToken(line));*/
        boolean inside = false;
        int id = -1;
        String line;
        // Scan token elements until the sentence-final token id is reached;
        // start collecting once the sentence-initial id is seen.
        while(id != end) {
            line = br.readLine().trim();
            while(line.equals(""))
                line = br.readLine().trim();
            if(!line.startsWith("")) // NOTE(review): token-element tag literal stripped
                throw new RuntimeException("Sorry, we expect tokens line by line...");
            id = extractIntByPattern(line, SEN_ID_PAT);
            if(id == start)
                inside = true;
            if(inside) {
                pbr.mark(LOOKAHEAD);
                String posLine = pbr.readLine().trim();
                // NOTE(review): condition garbled -- a stripped "<...>" span
                // removed part of the expression, leaving an extra ')'.
                if(!posLine.startsWith(""))) {
                    System.err.println("end = " + end);
                    System.err.println("line = |" + line + "|");
                    System.err.println("posLine = |" + posLine + "|");
                    throw new RuntimeException("Sorry, we expect POS tags line by line...");
                }
                // NOTE(review): the stripped span here also swallowed the code
                // that read lemmaLine from lbr (and pbr.reset() handling).
                if(posLine.startsWith(""))) {
                    System.err.println("line = |" + line + "|");
                    System.err.println("posLine = |" + posLine + "|");
                    System.err.println("lemmaLine = |" + lemmaLine + "|");
                    throw new RuntimeException("Sorry, we expect lemmas line by line...");
                }
                if(false) { // debug dump, disabled
                    System.err.println(line);
                    System.err.println(posLine);
                    System.err.println(lemmaLine);
                }
                String lemma = null;
                if(!lemmaLine.equals("")) {
                    // NOTE(review): declaration of 'on' lost in a stripped span.
                    on = extractIntByPattern(lemmaLine, ON_PAT);
                    if(on != id) {
                        // Lemma element refers to a different token: push it back.
                        lbr.reset();
                    } else
                        lemma = extractToken(lemmaLine);
                }
                String token = extractToken(line);
                String posTag = extractToken(posLine);
                out.add(token);
                tags.add(posTag);
                lemmas.add(lemma);
            }
        }
        // Build a dependency graph skeleton: node 0 is the artificial root,
        // nodes 1..n carry the sentence's word/pos/lemma (no arcs attached).
        DepGraph dg = new DepGraph(out.size());
        for(int i = 1; i < dg.nodes.length; i++) {
            dg.nodes[i] = new DepNode();
            dg.nodes[i].word = out.get(i - 1);
            dg.nodes[i].pos = tags.get(i - 1);
            dg.nodes[i].lemma = lemmas.get(i - 1);
        }
        return new Pair(out, new Triple(dg, dg, null));
    }

    /**
     * Writes the XML header of the subjectivity output file.
     *
     * NOTE(review): every tag literal below was stripped to "" by the
     * extraction -- the original preamble markup (referencing base and naming
     * the UniTNSubjExprTagger tool) must be restored from the source.
     */
    private static void printSubjPreamble(String base, String sentenceFileName, PrintWriter out) throws IOException {
        out.println("");
        out.println("");
        out.println("");
        out.println(" " + base + "");
        out.println(" UniTNSubjExprTagger");
        out.println("");
    }

    /**
     * Writes the closing markup of the subjectivity output file.
     * NOTE(review): tag literal stripped -- originally the closing tag(s).
     */
    private static void printSubjEnd(PrintWriter out) throws IOException {
        out.println("");
    }

    /**
     * Runs k-best sequence labeling on one sentence.
     *
     * Builds an n x 6 token table (word, POS, lemma, subjectivity-lexicon
     * class or "_", "_", "_") from the DepGraph, dumps it to stdout for
     * debugging, and returns SeqLabeler's k best taggings.
     *
     * NOTE(review): the element type of the returned ArrayList was stripped;
     * the caller accesses .right.spans on its elements.
     */
    private static ArrayList> kbest(TaggingModel model, SubjectivityLexicon sl, DepGraph dg, int k) {
        int n = dg.nodes.length - 1;
        int m = 6;
        String[][] tokens = new String[n][m];
        for(int i = 0; i < n; i++) {
            DepNode dn = dg.nodes[i+1];
            tokens[i][0] = dn.word;
            tokens[i][1] = dn.pos;
            tokens[i][2] = dn.lemma;
            String ss = sl.lookup(dn.word, dn.pos, dn.lemma);
            if(ss == null)
                tokens[i][3] = "_";
            else
                tokens[i][3] = ss;
            tokens[i][4] = "_";
            tokens[i][5] = "_";
        }
        // Debug dump of the feature table, one tab-separated row per token.
        for(int i = 0; i < n; i++) {
            for(int j = 0; j < tokens[i].length; j++) {
                if(j > 0)
                    System.out.print("\t");
                System.out.print(tokens[i][j]);
            }
            System.out.println();
        }
        System.out.println();
        ArrayList> out = SeqLabeler.tagSentenceKBest(model, tokens, k);
        System.out.println(out);
        return out;
    }

    /**
     * Appends one XML entity element per span carrying the given label
     * ("ds"/"es"/"os") to out, incrementing idCounter for each.
     *
     * @return the updated id counter
     *
     * NOTE(review): the appended element's attributes (presumably the entity
     * id and token offsets derived from idCounter and tid -- tid is otherwise
     * unused) were stripped from the string literal; restore from source.
     */
    private static int printEntities(int idCounter, int tid, String label, ArrayList spans, StringBuilder out) {
        for(Span s: spans)
            if(s.label.equals(label)) {
                idCounter++;
                out.append(" <" + label + "/>\n");
            }
        return idCounter;
    }

    /**
     * Tags every annotation file in dirName and writes one subjectivity XML
     * file per input into outputDir.
     *
     * Per file: read the SENTENCES layer, open the token file plus its
     * .pos.xml sibling twice (once for POS, once for LEMMA), then for each
     * sentence run {@link #kbest}; with k == 1 take the best tagging's spans
     * directly, otherwise rerank the k candidates with the classifier. Spans
     * are grouped into three buffers by label (es/os/ds) and flushed after the
     * preamble. Prints a sentences-per-second figure at the end.
     *
     * NOTE(review): generic parameters and the XML tags written around the
     * three span groups were stripped; System.exit(1) on a missing POS file
     * kills the whole batch rather than skipping the file.
     */
    public static void processDirectory(String dirName, String outputDir, TaggingModel model, int k, SubjectivityLexicon sl, RerankingFE fe, Classifier reranker) throws IOException {
        long t0 = System.currentTimeMillis();
        int countSentences = 0;
        File dir = new File(dirName);
        if(!dir.isDirectory())
            throw new IllegalArgumentException(dir + " is not a directory");
        for(File file: dir.listFiles()) {
            if(file.isDirectory())
                continue;
            Pair> p = readSentenceSpans(dirName, file.getName());
            if(p == null)
                continue; // no SENTENCES layer: not a sentence file
            String tokenFile = p.left;
            ArrayList senSpans = p.right;
            System.out.println("Sentence file " + file.getName() + ", token file " + tokenFile);
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + tokenFile), "UTF-8"));
            String base = readTokenPreamble(br);
            // Derive output and POS file names by swapping the last ".<layer>.xml"
            // suffix of the token file.
            String outFile;
            if(tokenFile.endsWith(".xml"))
                outFile = tokenFile.replaceFirst("\\.[^\\.]+\\.xml$", ".subjectivity.xml");
            else
                outFile = tokenFile + ".subjectivity.xml";
            outFile = outputDir + File.separatorChar + outFile;
            String posFile = null;
            if(tokenFile.endsWith(".xml"))
                posFile = tokenFile.replaceFirst("\\.[^\\.]+\\.xml$", ".pos.xml");
            else
                posFile = tokenFile + ".pos.xml";
            BufferedReader posReader = null;
            BufferedReader lemmaReader = null;
            try {
                // Same physical file opened twice: one cursor for the POS
                // layer, one for the LEMMA layer.
                posReader = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + posFile), "UTF-8"));
                lemmaReader = new BufferedReader(new InputStreamReader(new FileInputStream(dirName + File.separatorChar + posFile), "UTF-8"));
            } catch(Exception e) {
                System.err.println("Sorry, we expect the POS file name to end with .pos.xml");
                System.exit(1);
            }
            readPreamble(posReader, "POS");
            readPreamble(lemmaReader, "LEMMA");
            PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8"));
            printSubjPreamble(base, file.getName(), out);
            // Entities are buffered per label and written as three groups
            // after all sentences are processed.
            StringBuilder sbE = new StringBuilder();
            StringBuilder sbD = new StringBuilder();
            StringBuilder sbO = new StringBuilder();
            int idCounter = 0;
            int tokenIdCounter = 0;
            for(int[] senSpan: senSpans) {
                Pair, Triple> pp = readNextSentence(senSpan, br, posReader, lemmaReader);
                ArrayList sen = pp.left;
                if(sen == null)
                    throw new RuntimeException("null");
                DepGraph dg = (DepGraph) pp.right.first;
                ArrayList> cands = kbest(model, sl, dg, k);
                ArrayList spans;
                if(k == 1)
                    spans = cands.get(0).right.spans;
                else {
                    SynSemParse ssp = new SynSemParse(dg, null);
                    spans = Reranker.rerankSentence(fe, reranker, cands, ssp);
                }
                idCounter = printEntities(idCounter, tokenIdCounter, "ds", spans, sbD);
                idCounter = printEntities(idCounter, tokenIdCounter, "es", spans, sbE);
                idCounter = printEntities(idCounter, tokenIdCounter, "os", spans, sbO);
                //System.out.println(pp.left);
                //System.out.println(spans);
                /* */
                tokenIdCounter += sen.size();
                countSentences++;
            }
            // NOTE(review): the six println("") calls below originally wrote
            // the opening/closing tags around each entity group (es, os, ds).
            out.println("");
            out.print(sbE);
            out.println("");
            out.println("");
            out.print(sbO);
            out.println("");
            out.println("");
            out.print(sbD);
            out.println("");
            printSubjEnd(out);
            out.close();
            posReader.close();
            lemmaReader.close();
            br.close();
        }
        long t1 = System.currentTimeMillis();
        double ssPerSec = 1000.0 * countSentences / (t1 - t0);
        System.out.format("Processed %f sentences per second.", ssPerSec);
    }

    /**
     * Loads a model bundle (in the order written by {@link #makePackage}) and
     * processes a directory.
     *
     * argv layout: [1] model bundle file, [2] input directory,
     * [3] output directory, [4] k (beam size for k-best tagging).
     */
    public static void run(String[] argv) {
        try {
            String modelFile = argv[1];
            String inDir = argv[2];
            String outDir = argv[3];
            int k = Integer.parseInt(argv[4]);
            ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
            SubjectivityLexicon lex = (SubjectivityLexicon) ois.readObject();
            TaggingModel tm = (TaggingModel) ois.readObject();
            RerankingFE fe = (RerankingFE) ois.readObject();
            Classifier reranker = (Classifier) ois.readObject();
            ois.close();
            processDirectory(inDir, outDir, tm, k, lex, fe, reranker);
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

    /** Dispatches on argv[0]: -run or -package; anything else is an error. */
    public static void main(String[] argv) {
        if(argv[0].equals("-run"))
            run(argv);
        else if(argv[0].equals("-package"))
            makePackage(argv);
        else {
            System.err.println("error: " + Arrays.toString(argv));
        }
    }
}