/*
 * SSTToLK -- command-line converter driven entirely from main(String[] argv).
 *
 * Arguments (as read by main):
 *   argv[0]  lkTextFile     path handed to readLKText(); the returned string
 *                           (lkText) is the text that token offsets index into.
 *   argv[1]  sstOutputFile  tagger output, read line by line.
 * The commented-out declarations show that argv[2]..argv[5] were once the
 * output paths; they are now derived instead: BASE is lkTextFile with every
 * match of "\.lktext\.xml" removed, and the outputs are BASE + ".tokens.xml",
 * BASE + ".pos.xml", BASE + ".sst.xml" and BASE + ".conll08in", each written
 * through an OutputStreamWriter using ENCODING ("UTF-8").
 *
 * Input handling in main:
 *   - Each line is trimmed; empty lines are skipped.
 *   - A non-empty line is split on a single space. The field count must be a
 *     multiple of 6, otherwise RuntimeException("this line: |...|") is thrown.
 *   - Per group of six fields: [0] token, [1] pos, [2] lemma, and [3], [4],
 *     [5] are appended to ssIOB, conll03IOB and wsjIOB respectively.
 *   - For the first token of a line (i == 0), any of fields 3..5 that starts
 *     with "I-" is rewritten to start with "B-" before being stored.
 *   - One tab-separated row per token goes to the .conll08in file:
 *     (i + 1), token, lemma, "_", pos, token, lemma, pos, "0", "ROOT";
 *     an empty line is printed after each input line.
 *   - Token ids are 1-based (tokenIdCounter is pre-incremented). The token's
 *     start offset is the current position; getEnd() gives the end offset and
 *     getNext() then advances position past whitespace in lkText.
 *   - printEntity(token, start, end - 1, id, tokenOut) and
 *     printEntity(pos, id, -1, id, posOut) are called per token.
 *   - sentenceEnds receives the running token count after each input line.
 * After the loop, one printEntity call per lemma goes to posOut with
 * id = tokenIndex + nTokens, and printIOB() is called three times on sstOut
 * with preambles containing "WNSS", "NE-CONLL03" and "NE-WSJ".
 * Any exception inside main's try block is reported with printStackTrace()
 * and the process exits with status 1.
 *
 * Helpers whose bodies are visible:
 *   getNext(lkText, position)
 *       Returns the first index >= position that is not whitespace, or
 *       lkText.length() if only whitespace remains.
 *   getEnd(lkText, token, position, line)
 *       Compares token with the same-length slice of lkText at position
 *       (slice is clipped at the end of lkText). On a match, returns
 *       position + length. If the token is `` or '' and the slice starts with
 *       a double quote, one character is consumed instead. Otherwise it prints
 *       line and getContext(lkText, position) to System.out and throws
 *       RuntimeException(token + " != " + slice).
 *   printIOB(sstOut, preamble, ssid, nTokens, iob)
 *       Prints preamble, then walks the tags. Every tag must equal "0" (the
 *       digit zero, as written in the code) or start with "B-" or "I-",
 *       otherwise RuntimeException("Illegal tag ...") is thrown. A span is
 *       opened at the first non-"0" tag; it is closed -- and emitted via
 *       printEntity(label, firstTokenId, lastTokenId, ++ssid, sstOut) -- when
 *       a "0" tag, a "B-" tag, or a different label follows. A span still
 *       open at the end is emitted with nTokens as its last token id. Returns
 *       the updated ssid.
 *
 * NOTE(review): the physical layout of this file looks damaged -- confirm
 * against version control before editing the code below.
 *   - Line breaks appear to have been collapsed. As the text stands, each
 *     "//" comment runs to the end of its physical line, so most of the
 *     class is technically inside a comment.
 *   - Text that would sit between angle brackets appears to be missing: the
 *     lists are declared as raw ArrayList but used as typed collections
 *     (for(Integer i : sentenceEnds), String lemma = lemmas.get(i)), many
 *     println("") calls print empty strings, the per-sentence StringBuilder
 *     is printed without anything appended, and endString is never declared
 *     in the visible code.
 *   - readLKText() runs directly into what looks like the tail of
 *     printEntity(); the header of printEntity() and all of getContext() are
 *     not visible, so their contracts are not documented here.
 * NOTE(review): sstOutputFile is opened with FileReader (platform default
 * charset) while all outputs are written as UTF-8, and no close() call for
 * that reader is visible in main -- verify whether this is intended.
 */
package lkformat; import java.io.*; import java.util.*; //import java.util.regex.*; public class SSTToLK { private static final String ENCODING = "UTF-8"; public static void main(String[] argv) { String lkTextFile = argv[0]; String sstOutputFile = argv[1]; /*String outTokenFile = argv[2]; String outPOSFile = argv[3]; String outSSTFile = argv[4]; String outCoNLL2008File = argv[5];*/ String basename = lkTextFile.replaceAll("\\.lktext\\.xml", ""); String outTokenFile = basename + ".tokens.xml"; String outPOSFile = basename + ".pos.xml"; String outSSTFile = basename + ".sst.xml"; String outCoNLL2008File = basename + ".conll08in"; try { String lkText = readLKText(lkTextFile); BufferedReader br = new BufferedReader(new FileReader(sstOutputFile)); PrintWriter tokenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outTokenFile), ENCODING)); PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING)); PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outSSTFile), ENCODING)); PrintWriter tabularCoNLL08Out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outCoNLL2008File), ENCODING)); tokenOut.println(""); tokenOut.println(""); tokenOut.println(""); tokenOut.println(" " + lkTextFile + ""); tokenOut.println(" SSTLight"); tokenOut.println(""); tokenOut.println(""); posOut.println(""); posOut.println(""); posOut.println(""); posOut.println(" " + lkTextFile + ""); posOut.println(" SSTLight"); posOut.println(""); posOut.println(""); ArrayList sentenceEnds = new ArrayList(); ArrayList lemmas = new ArrayList(); ArrayList ssIOB = new ArrayList(); ArrayList conll03IOB = new ArrayList(); ArrayList wsjIOB = new ArrayList(); int tokenIdCounter = 0; int position = 0; String line = br.readLine(); while(line != null) { line = line.trim(); if(!line.equals("")) { String[] ss = line.split(" "); if(ss.length % 6 != 0) throw new RuntimeException("this line: |" + line + "|"); int n = ss.length 
/ 6; for(int i = 0; i < n; i++) { String token = ss[6*i]; String pos = ss[6*i + 1]; String lemma = ss[6*i + 2]; lemmas.add(lemma); tabularCoNLL08Out.print((i + 1) + "\t"); tabularCoNLL08Out.print(token + "\t"); tabularCoNLL08Out.print(lemma + "\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print(pos + "\t"); tabularCoNLL08Out.print(token + "\t"); tabularCoNLL08Out.print(lemma + "\t"); tabularCoNLL08Out.print(pos + "\t"); tabularCoNLL08Out.print("0\t"); tabularCoNLL08Out.println("ROOT"); if(i == 0) for(int j = 3; j < 6; j++) if(ss[6*i + j].startsWith("I-")) ss[6*i + j] = "B-" + ss[6*i + j].substring(2); ssIOB.add(ss[6*i + 3]); conll03IOB.add(ss[6*i + 4]); wsjIOB.add(ss[6*i + 5]); int id = ++tokenIdCounter; int start = position; int end = getEnd(lkText, token, position, line); printEntity(token, start, end - 1, id, tokenOut); printEntity(pos, id, -1, id, posOut); position = getNext(lkText, end); } sentenceEnds.add(tokenIdCounter); tabularCoNLL08Out.println(); } line = br.readLine(); } int nTokens = tokenIdCounter; tokenOut.println(""); tokenOut.println(""); int prev = 1; for(Integer i: sentenceEnds) { StringBuilder sb = new StringBuilder(" "); tokenOut.println(sb); prev = i + 1; } tokenOut.println(""); tokenOut.println(""); tokenOut.close(); posOut.println(""); posOut.println(""); for(int i = 0; i < lemmas.size(); i++) { String lemma = lemmas.get(i); int tid = i + 1; int id = tid + nTokens; printEntity(lemma, tid, -1, id, posOut); } posOut.println(""); posOut.println(""); posOut.close(); sstOut.println(""); sstOut.println(""); sstOut.println(""); sstOut.println(" " + lkTextFile + ""); sstOut.println(" SSTLight"); sstOut.println(""); String preamble = ""; int ssid = 0; ssid = printIOB(sstOut, preamble + "WNSS" + endString, ssid, nTokens, ssIOB); ssid = printIOB(sstOut, preamble + "NE-CONLL03" + endString, ssid, nTokens, conll03IOB); ssid = printIOB(sstOut, preamble + "NE-WSJ" + endString, ssid, nTokens, wsjIOB); sstOut.println(""); sstOut.close(); 
tabularCoNLL08Out.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static String readLKText(String file) throws IOException { BufferedReader br = new BufferedReader(new FileReader(file)); StringBuilder sb = new StringBuilder(); String line = br.readLine(); boolean started = false; while(line != null) { if(!started) { if(line.startsWith("".length())); sb.append("\n"); } } else { if(line.startsWith(""); else sb.append("\" on=\"#" + start + "\">"); sb.append(l); sb.append(""); out.println(sb); } private static int getNext(String lkText, int position) { while(position < lkText.length() && Character.isWhitespace(lkText.charAt(position))) position++; return position; } private static int getEnd(String lkText, String token, int position, String line) { int len = Math.min(lkText.length() - position, token.length()); //System.out.println("position = " + position); //System.out.println("len = " + len); String t2 = lkText.substring(position, position + len); if(!token.equals(t2)) { if(token.matches("``|''") && t2.startsWith("\"")) { len = 1; t2 = "\""; } else { System.out.println(line); System.out.println(getContext(lkText, position)); throw new RuntimeException(token + " != " + t2); } } //System.out.println(token); position += len; //while(position < lkText.length() && Character.isWhitespace(lkText.charAt(position))) // position++; //System.out.println("returning " + position); return position; } private static int printIOB(PrintWriter sstOut, String preamble, int ssid, int nTokens, ArrayList iob) { sstOut.println(preamble); String openTag = null; int openTagStart = -1; for(int i = 0; i < iob.size(); i++) { String tag = iob.get(i); if(!tag.equals("0") && !tag.startsWith("B-") && !tag.startsWith("I-")) throw new RuntimeException("Illegal tag " + tag); int tid = i + 1; if(openTag != null) { String t = tag.equals("0")? 
null: tag.substring(2); if(t == null || tag.startsWith("B") || !t.equals(openTag)) { int id = ++ssid; printEntity(openTag, openTagStart, tid-1, id, sstOut); openTag = null; } } if(!tag.equals("0")) { if(openTag == null) { openTag = tag.substring(2); openTagStart = tid; } else { if(!tag.substring(2).equals(openTag)) throw new RuntimeException("Illegal tag here"); } } //printEntity(lemma, tid, -1, id, posOut); } if(openTag != null) { int id = ++ssid; printEntity(openTag, openTagStart, nTokens, id, sstOut); openTag = null; } sstOut.println(""); return ssid; } }