package lkformat;

import java.io.*;
import java.util.regex.*;

/**
 * Command-line converter that reads one input text file line by line and
 * writes three output files: a text file, a structure file and a raw file.
 *
 * <p>Arguments, in order: input file, output text file, output structure
 * file, output raw file. The input file's base name must match
 * {@code ep-YY-MM-DD.txt}; the date derived from it is written to the text
 * output header.
 *
 * <p>NOTE(review): many of the string literals emitted below are empty or
 * contain only a leading space. They are reproduced exactly as found; it
 * looks like markup may have been lost from them at some point -- confirm
 * against the upstream version before relying on the output format.
 */
public class EuroparlToLK {

    /** Character encoding used for all three output files. */
    private static final String ENCODING = "UTF-8";

    /** Expected shape of the input file's base name; groups are year, month, day. */
    private static final Pattern DATE_PATTERN = Pattern.compile("ep-(..)-(..)-(..)\\.txt");

    /** Sentinel meaning "no start offset recorded". */
    private static final int UNSET = Integer.MIN_VALUE;

    /**
     * Entry point. Prints a usage message and exits with status 1 when fewer
     * than four arguments are given; prints the stack trace and exits with
     * status 1 when the conversion fails.
     *
     * @param argv input file, output text file, output structure file, output raw file
     */
    public static void main(String[] argv) {
        if (argv.length < 4) {
            System.err.println(
                "Usage: EuroparlToLK <inputFile> <outTextFile> <outStructureFile> <outRawFile>");
            System.exit(1);
            return;
        }

        try {
            convert(argv[0], argv[1], argv[2], argv[3]);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Performs the conversion. All streams are closed (and the writers
     * flushed) whether or not an exception is thrown.
     *
     * @throws IOException if a file cannot be opened or read, or if any of the
     *         output writers reported a write error
     * @throws RuntimeException if the base name does not match the expected
     *         pattern or a line starting with {@code <} is rejected
     */
    private static void convert(String fileName, String outTextFile,
                                String outStructureFile, String outRawFile) throws IOException {
        // NOTE(review): the input is decoded with the platform default charset
        // while the outputs are written as UTF-8 -- confirm this is intended.
        try (BufferedReader br = new BufferedReader(new FileReader(fileName));
             PrintWriter textOut = openWriter(outTextFile);
             PrintWriter structOut = openWriter(outStructureFile);
             PrintWriter rawOut = openWriter(outRawFile)) {

            int ix = fileName.lastIndexOf('/');
            String baseName = ix == -1 ? fileName : fileName.substring(ix + 1);
            String isoDate = extractDate(baseName);

            // Text file header.
            textOut.println("");
            textOut.println("");
            textOut.println("");
            textOut.println(" " + baseName + "");
            textOut.println(" " + isoDate + "");
            textOut.println("");
            textOut.print("");

            // Structure file header.
            structOut.println("");
            structOut.println("");
            structOut.println("");
            structOut.println(" " + outTextFile + "");
            structOut.println(" EuroparlToLK");
            structOut.println("");
            structOut.println("");

            rawOut.println("DUMMY");

            // Running offset: each emitted line counts as its length plus one.
            int position = 0;
            // Neither start offset is ever assigned a real value in this file,
            // so the "speaker" and "chapter" entities below are never emitted
            // as the code stands.
            int chStart = UNSET;
            int spStart = UNSET;

            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.startsWith("<")) {
                    // startsWith("") is true for every string, so the else
                    // branch is unreachable as written.
                    if (line.startsWith("")) {
                        printEntity("p", position, position, structOut);
                    } else {
                        throw new RuntimeException("line = " + line);
                    }
                } else {
                    if (line.startsWith("(") && spStart != UNSET) {
                        printEntity("speaker", spStart, position - 1, structOut);
                        spStart = UNSET;
                    }
                    // Literal replacement; no regex metacharacters are involved.
                    line = line.replace("' s ", "'s ");
                    textOut.println(line);
                    position += line.length() + 1;
                    rawOut.println(line);
                }
            }

            if (spStart != UNSET) {
                printEntity("speaker", spStart, position - 1, structOut);
            }
            if (chStart != UNSET) {
                printEntity("chapter", chStart, position - 1, structOut);
            }

            // Text file footer.
            textOut.println("");
            textOut.println("");

            // Structure file footer.
            structOut.println("");
            structOut.println("");

            // PrintWriter never throws on write failure; surface it explicitly
            // so a truncated output does not look like a successful run.
            boolean textFailed = textOut.checkError();
            boolean structFailed = structOut.checkError();
            boolean rawFailed = rawOut.checkError();
            if (textFailed || structFailed || rawFailed) {
                throw new IOException("Error while writing output files");
            }
        }
    }

    /** Opens {@code path} for writing using {@link #ENCODING}. */
    private static PrintWriter openWriter(String path) throws IOException {
        return new PrintWriter(new OutputStreamWriter(new FileOutputStream(path), ENCODING));
    }

    /**
     * Writes one entity line, consisting of a single space followed by the
     * label, to {@code out}.
     *
     * @param l     entity label
     * @param start start offset; accepted but not written to the output
     * @param end   end offset; accepted but not written to the output
     * @param out   destination writer
     */
    private static void printEntity(String l, int start, int end, PrintWriter out) {
        StringBuilder sb = new StringBuilder(" ");
        sb.append(l);
        sb.append("");
        out.println(sb);
    }

    /**
     * Derives a {@code YYYY-MM-DD} string from a base name of the form
     * {@code ep-YY-MM-DD.txt}. A two-character year starting with {@code 9}
     * is prefixed with {@code 19}; any other year is prefixed with {@code 20}.
     *
     * @param d file base name
     * @return the date string
     * @throws RuntimeException if {@code d} does not match the expected pattern
     */
    private static String extractDate(String d) {
        Matcher m = DATE_PATTERN.matcher(d);
        if (!m.matches()) {
            throw new RuntimeException("Couldn't extract date");
        }

        String yy = m.group(1);
        String mm = m.group(2);
        String dd = m.group(3);

        String century = yy.startsWith("9") ? "19" : "20";
        return century + yy + "-" + mm + "-" + dd;
    }
}