package lkformat2; import java.io.*; import java.util.*; public class PreprocessParser { private static String extractAttribute(String line, String attr) { String s = attr + "=\""; int ix1 = line.indexOf(s); if(ix1 == -1) return null; ix1 += s.length(); int ix2 = line.indexOf("\"", ix1); return line.substring(ix1, ix2); } private static String extractEntityData(String line) { int ix1 = line.indexOf("", ix1); if(ix2 == -1) return null; int ix3 = line.lastIndexOf(""); if(ix3 == -1) return null; return line.substring(ix2 + 1, ix3); } public static void processFile(String fileName, PrintWriter out) { try { if(new File(fileName).isDirectory()) return; BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8")); String line = br.readLine(); while(line != null) { if(line.contains("provides=\"SENTENCES\"")) break; line = br.readLine(); } if(line == null) return; String tokenFile = extractAttribute(line, "scope"); if(tokenFile == null) tokenFile = fileName; //System.out.println("Sentences from " + fileName + ", tokens from " + tokenFile); ArrayList spans = new ArrayList(); line = br.readLine(); while(!line.contains("")) { String start = extractAttribute(line, "start"); String end = extractAttribute(line, "end"); //System.out.println("line = " + line + " start = " + start + " end = " + end); if(start == null) throw new RuntimeException("Only start-end annotation supported for sentences"); if(end == null) throw new RuntimeException("Only start-end annotation supported for sentences"); if(start.charAt(0) != '#') throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " start = " + start + " end = " + end); if(end.charAt(0) != '#') throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " start = " + start + " end = " + end); start = start.substring(1); end = end.substring(1); spans.add(new String[] { start, end }); line = br.readLine(); } if(spans.isEmpty()) return; br.close(); br = new BufferedReader(new InputStreamReader(new FileInputStream(tokenFile), "UTF-8")); line = br.readLine(); while(line != null) { if(line.contains("provides=\"TOKENS\"")) break; line = br.readLine(); } int senPos = 0; String[] senSpan = spans.get(senPos); boolean inside = false; int prev = 0; ArrayList tokens = new ArrayList(); line = br.readLine(); while(!line.contains("")) { line = line.trim(); if(!line.equals("")) { String t = extractEntityData(line); if(t == null) throw new RuntimeException("Could not extract token"); String id = extractAttribute(line, "id"); if(id == null) throw new RuntimeException("Could not extract id"); int idi = Integer.parseInt(id); if(idi != prev + 1) throw new RuntimeException("I have assumed contiguous ids..."); prev = idi; if(id.equals(senSpan[0])) inside = true; if(inside) { //out.println(t); String[] ts = new String[4]; ts[0] = id; ts[1] = t; tokens.add(ts); } if(id.equals(senSpan[1])) { senPos++; if(senPos == spans.size()) break; senSpan = spans.get(senPos); inside = false; } } line = br.readLine(); } String posFile = tokenFile.replaceAll("\\.[^\\.]+\\.xml", ".pos.xml"); // TEMPORARY //posFile = posFile.replaceFirst("solr-lkxml", "lk_output_new"); br.close(); br = new BufferedReader(new InputStreamReader(new FileInputStream(posFile), "UTF-8")); //out.println("___BEGIN___|" + tokenFile); //out.println(); out.print("0\t___BEGIN___|" + tokenFile + "\t"); out.print("_\t"); out.print("_\t"); out.print("_\t"); out.print("_\t"); out.print("___BEGIN___|" + tokenFile + "\t"); out.print("_\t"); out.print("0\t"); out.println("ROOT"); out.println(); line = br.readLine(); while(line != null) { if(line.contains("provides=\"POS\"")) break; line = br.readLine(); } if(line == null) throw new RuntimeException("No POS annotation found!"); line = br.readLine(); while(!line.contains("")) { line = line.trim(); if(!line.equals("")) { String t = extractEntityData(line); if(t == null) throw new RuntimeException("Could not extract token"); String on = extractAttribute(line, "on"); if(on == null) throw new RuntimeException("Only on annotation supported for sentences"); if(on.charAt(0) != '#') throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " on = " + on); on = on.substring(1); int position = Integer.parseInt(on) - 1; String[] ts = tokens.get(position); if(!on.equals(ts[0])) throw new RuntimeException("!!!"); ts[2] = t; } line = br.readLine(); } while(line != null) { if(line.contains("provides=\"LEMMA\"")) break; line = br.readLine(); } line = br.readLine(); while(!line.contains("")) { line = line.trim(); if(!line.equals("")) { String t = extractEntityData(line); if(t == null) throw new RuntimeException("Could not extract token"); String on = extractAttribute(line, "on"); if(on == null) throw new RuntimeException("Only on annotation supported for sentences"); if(on.charAt(0) != '#') throw new RuntimeException("Only relative URIs supported for sentences: line = " + line + " on = " + on); on = on.substring(1); int position = Integer.parseInt(on) - 1; String[] ts = tokens.get(position); if(!on.equals(ts[0])) throw new RuntimeException("!!!"); ts[3] = t; } line = br.readLine(); } br.close(); senPos = 0; senSpan = spans.get(senPos); int posInSentence = 0; for(String[] ts: tokens) { //out.println(Arrays.toString(ts)); posInSentence++; out.print(posInSentence + "\t"); out.print(ts[1] + "\t"); out.print(ts[3] + "\t"); out.print("_\t"); out.print(ts[2] + "\t"); out.print(ts[1] + "\t"); out.print(ts[3] + "\t"); out.print(ts[2] + "\t"); out.print("0\t"); out.println("ROOT"); if(ts[0].endsWith(senSpan[1])) { out.println(); senPos++; if(senPos == spans.size()) break; senSpan = spans.get(senPos); posInSentence = 0; } } out.print("1\t___END___\t"); out.print("_\t"); out.print("_\t"); out.print("_\t"); out.print("___END___\t"); out.print("_\t"); out.print("_\t"); out.print("0\t"); out.println("ROOT"); out.println(); //out.println("___END___|" + tokenFile); //out.println(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void processDirectory(String dirName, String outFileName) { try { PrintWriter out = new PrintWriter(new FileWriter(outFileName)); String[] files = new File(dirName).list(); Arrays.sort(files); for(String file: files) { processFile(dirName + File.separatorChar + file, out); } out.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void main(String[] argv) { processDirectory(argv[0], argv[1]); } }