package mpqareader; import java.io.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class PatchCorpus { public static void main(String[] argv) { if(argv.length < 1) { System.err.println("Must specify -strip, -patch, or -check."); System.exit(1); } if(argv[0].equals("-strip")) { strip(argv); } else if(argv[0].equals("-patch")) { patch(argv); } else if(argv[0].equals("-check")) { check(argv); } else { System.err.println("Unknown mode: " + argv[0]); System.exit(1); } } private static void patch(String[] argv) { if(argv.length < 4) { System.err.println("Must give 3 arguments: directory of stripped " + "files, MPQA document directory, and output directory."); System.exit(1); } try { File strippedDir = new File(argv[1]); String mpqaDir = argv[2]; String outDir = argv[3]; if(!new File(outDir).exists()) { System.err.println("Output directory does not exist."); if(!new File(outDir).mkdir()) { System.err.println("Could not create output directory."); System.exit(1); } else System.err.println("Created output directory."); } int count = 0; char[] buf = new char[BUF_SIZE]; for(File f: strippedDir.listFiles()) { if(f.getName().endsWith("lktext.xml")) { StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } br.close(); int six = sb.indexOf("___STRIPPED___"); int fn_six = sb.indexOf("database.mpqa.2.0/docs/") + "database.mpqa.2.0/docs/".length(); int fn_eix = sb.indexOf("")) { Matcher m = TOKEN_PAT.matcher(line); if(!m.find()) throw new RuntimeException("line = " + line); String id = m.group(1); int start = Integer.parseInt(m.group(2)); int end = Integer.parseInt(m.group(3)); String encToken = m.group(4); String token; if(encToken.equals("___STRIPPED___")) token = text.substring(start, end + 1); else token = encToken; pw.println(" " + token + ""); line = br.readLine(); } while(line != null) { pw.println(line); line = br.readLine(); } br.close(); pw.close(); } else if(!f.getName().endsWith("lktext.xml")) { StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } br.close(); String outFileName = outDir + File.separatorChar + f.getName(); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")); pw.print(sb); pw.close(); } } System.out.println("Patched " + count + " files."); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static String encodeXML(String s) { s = s.replaceAll("&", "&"); s = s.replaceAll("\"", """); s = s.replaceAll("<", "<"); s = s.replaceAll(">", ">"); String c26 = "" + (char) 26; s = s.replaceAll(c26, ""); return s; } private static String decodeXML(String s) { s = s.replaceAll(""", "\""); s = s.replaceAll("<", "<"); s = s.replaceAll(">", ">"); s = s.replaceAll("&", "&"); return s; } private static int BUF_SIZE = 10000; private static final Pattern TOKEN_PAT = Pattern.compile("id=\"(.*?)\" start=\"#(.*?)\" end=\"#(.*?)\".*?>(.*?)") + "".length(); int eix = sb.lastIndexOf(""); sb.delete(six, eix); sb.insert(six, "___STRIPPED___"); String outFileName = outDir + File.separatorChar + f.getName(); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")); pw.print(sb); pw.close(); count++; } else if(f.getName().endsWith("tokens.xml")) { String outFileName = outDir + File.separatorChar + f.getName(); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); String line = br.readLine(); while(line != null) { pw.println(line); if(line.contains("")) { Matcher m = TOKEN_PAT.matcher(line); if(!m.find()) throw new RuntimeException("line = " + line); String id = m.group(1); int start = Integer.parseInt(m.group(2)); int end = Integer.parseInt(m.group(3)); String token = m.group(4); String ttext = text.substring(start, end + 1); String encToken; if(token.equals(ttext)) encToken = "___STRIPPED___"; else encToken = token; pw.println(" " + encToken + ""); line = br.readLine(); } while(line != null) { pw.println(line); line = br.readLine(); } pw.close(); } else { StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } String outFileName = outDir + File.separatorChar + f.getName(); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")); pw.print(sb); pw.close(); } } System.out.println("Stripped " + count + " files."); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static String readText(String file, char[] buf) throws IOException { StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } int six = sb.indexOf("") + "".length(); int eix = sb.lastIndexOf(""); String text = sb.substring(six, eix); text = decodeXML(text); return text; } private static void check(String[] argv) { try { File dir = new File(argv[1]); String outDir = argv[2]; char[] buf = new char[BUF_SIZE]; int count = 0; for(File f: dir.listFiles()) { count++; if(count % 50 == 0) { System.out.print("."); System.out.flush(); } StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } String s1 = sb.toString(); s1 = s1.replaceAll("\n\r", "\r"); String fileName2 = outDir + File.separatorChar + f.getName(); StringBuilder sb2 = new StringBuilder(); BufferedReader br2 = new BufferedReader(new InputStreamReader(new FileInputStream(fileName2), "UTF-8")); while(true) { int n = br2.read(buf); if(n == -1) break; sb2.append(new String(buf, 0, n)); } String s2 = sb.toString(); s2 = s2.replaceAll("\n\r", "\r"); if(!s1.equals(s2)) { System.out.println(f.getName() + " and " + fileName2 + " differ."); System.out.println(s1.length()); System.out.println(s2.length()); for(int i = 0; i < s1.length(); i++) { if(s1.charAt(i) != s2.charAt(i)) System.out.print("X"); char c1 = s1.charAt(i); char c2 = s2.charAt(i); String sc1; if(c1 == 13) sc1 = "\\n"; else if(c1 == 10) sc1 = "\\r"; else if(c1 < 32) sc1 = "(" + (int) c1 + ")"; else sc1 = "" + c1; String sc2; if(c2 == 13) sc2 = "\\n"; else if(c2 == 10) sc2 = "\\r"; else if(c2 < 32) sc2 = "(" + (int) c2 + ")"; else sc2 = "" + c2; System.out.println(i + "\t" + sc1 + "(" + (int) c1 + ")\t" + sc2 + "(" + (int) c2 + ")"); } break; } } System.out.println(); System.out.println("Checked " + count + " files."); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } }