package lkformat2; import java.io.*; import java.util.*; //import java.util.regex.*; public class SSTToLK { private static final String ENCODING = "UTF-8"; public static void main(String[] argv) { String lkDir = argv[0]; String sstOutputFile = argv[1]; String conll2008File = argv[2]; String outDir = argv[3]; System.out.println("argv = " + Arrays.toString(argv)); try { BufferedReader sstInput = new BufferedReader(new InputStreamReader(new FileInputStream(sstOutputFile), ENCODING)); PrintWriter conll2008Out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(conll2008File), ENCODING)); String[] files = new File(lkDir).list(); Arrays.sort(files); for(String file: files) { if(file.endsWith("lktext.xml")) processFile(lkDir + File.separatorChar + file, sstInput, conll2008Out, outDir); } conll2008Out.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void processFile(String textFile, BufferedReader sstInput, PrintWriter tabularCoNLL08Out, String outDir) { try { textFile = textFile.replaceAll("[^/]+/", ""); System.out.println("textFile = " + textFile); String line = sstInput.readLine(); if(line == null || !line.startsWith("___BEGIN___")) throw new RuntimeException("Excpected beginning of file..."); String tokenFile = line.substring("___BEGIN___|".length()); tokenFile = tokenFile.replaceAll(" .*", ""); tabularCoNLL08Out.print("0\t___BEGIN___|" + tokenFile + "\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("___BEGIN___|" + tokenFile + "\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("0\t"); tabularCoNLL08Out.println("ROOT"); tabularCoNLL08Out.println(); tokenFile = tokenFile.replaceAll("[^/]+/", ""); System.out.println(tokenFile); String basename = textFile.replaceFirst("\\.lktext\\.xml", ""); basename = basename.replaceAll("[^/]+/", ""); basename = outDir + File.separatorChar + basename; String outPOSFile = basename + ".pos.xml"; String outSSTFile = basename + ".sst.xml"; PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING)); PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outSSTFile), ENCODING)); posOut.println(""); posOut.println(""); posOut.println(""); posOut.println(" " + textFile + ""); posOut.println(" SSTLight"); posOut.println(""); posOut.println(""); sstOut.println(""); sstOut.println(""); sstOut.println(""); sstOut.println(" " + textFile + ""); sstOut.println(" SSTLight"); sstOut.println(""); ArrayList lemmas = new ArrayList(); ArrayList ssIOB = new ArrayList(); ArrayList conll03IOB = new ArrayList(); ArrayList wsjIOB = new ArrayList(); int tokenIdCounter = 0; line = sstInput.readLine(); while(line != null && !line.contains("___END___")) { line = line.trim(); if(!line.equals("")) { String[] ss = line.split(" "); if(ss.length % 6 != 0) throw new RuntimeException("this line: |" + line + "|"); int n = ss.length / 6; for(int i = 0; i < n; i++) { String token = ss[6*i]; String pos = ss[6*i + 1]; String lemma = ss[6*i + 2]; lemmas.add(lemma); tabularCoNLL08Out.print((i + 1) + "\t"); tabularCoNLL08Out.print(token + "\t"); tabularCoNLL08Out.print(lemma + "\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print(pos + "\t"); tabularCoNLL08Out.print(token + "\t"); tabularCoNLL08Out.print(lemma + "\t"); tabularCoNLL08Out.print(pos + "\t"); tabularCoNLL08Out.print("0\t"); tabularCoNLL08Out.println("ROOT"); if(i == 0) for(int j = 3; j < 6; j++) if(ss[6*i + j].startsWith("I-")) ss[6*i + j] = "B-" + ss[6*i + j].substring(2); ssIOB.add(ss[6*i + 3]); conll03IOB.add(ss[6*i + 4]); wsjIOB.add(ss[6*i + 5]); int id = ++tokenIdCounter; printEntity(pos, id, -1, id, posOut); } tabularCoNLL08Out.println(); } line = sstInput.readLine(); } int nTokens = tokenIdCounter; posOut.println(""); posOut.println(""); for(int i = 0; i < lemmas.size(); i++) { String lemma = lemmas.get(i); int tid = i + 1; int id = tid + nTokens; printEntity(lemma, tid, -1, id, posOut); } posOut.println(""); posOut.println(""); posOut.close(); String preamble = ""; int ssid = 0; ssid = printIOB(sstOut, preamble + "WNSS" + endString, ssid, nTokens, ssIOB); ssid = printIOB(sstOut, preamble + "NE-CONLL03" + endString, ssid, nTokens, conll03IOB); ssid = printIOB(sstOut, preamble + "NE-WSJ" + endString, ssid, nTokens, wsjIOB); sstOut.println(""); sstOut.close(); tabularCoNLL08Out.print("0\t___END___\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("___END___\t"); tabularCoNLL08Out.print("_\t"); tabularCoNLL08Out.print("0\t"); tabularCoNLL08Out.println("ROOT"); tabularCoNLL08Out.println(); posOut.close(); sstOut.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } static void printEntity(String l, int start, int end, int id, PrintWriter out) { StringBuilder sb = new StringBuilder(" "); else sb.append("\" on=\"#" + start + "\">"); sb.append(l); sb.append(""); out.println(sb); } private static int printIOB(PrintWriter sstOut, String preamble, int ssid, int nTokens, ArrayList iob) { sstOut.println(preamble); String openTag = null; int openTagStart = -1; for(int i = 0; i < iob.size(); i++) { String tag = iob.get(i); if(!tag.equals("0") && !tag.startsWith("B-") && !tag.startsWith("I-")) throw new RuntimeException("Illegal tag " + tag); int tid = i + 1; if(openTag != null) { String t = tag.equals("0")? null: tag.substring(2); if(t == null || tag.startsWith("B") || !t.equals(openTag)) { int id = ++ssid; printEntity(openTag, openTagStart, tid-1, id, sstOut); openTag = null; } } if(!tag.equals("0")) { if(openTag == null) { openTag = tag.substring(2); openTagStart = tid; } else { if(!tag.substring(2).equals(openTag)) throw new RuntimeException("Illegal tag here"); } } //printEntity(lemma, tid, -1, id, posOut); } if(openTag != null) { int id = ++ssid; printEntity(openTag, openTagStart, nTokens, id, sstOut); openTag = null; } sstOut.println(""); return ssid; } }