package mpqareader; import java.util.*; import se.lth.cs.nlp.nlputils.annotations.*; import se.lth.cs.nlp.nlputils.core.Util; public class EPEPreprocessor { public static void main(String[] argv) { try { String mpqaDir = argv[0]; String docList = argv[1]; String outDir = argv[2]; HashSet docs = new HashSet(); Util.readLines(docList, docs); Iterator i = new MPQAReader.MPQADirIterator(mpqaDir, true, null); HashSet toExclude = new HashSet(); toExclude.add("database.mpqa.2.0/docs/20020203/20.46.36-9539"); toExclude.add("database.mpqa.2.0/docs/ula/im_401b_e73i32c22_031705-2"); toExclude.add("database.mpqa.2.0/docs/ula/AFGP-2002-600175-Trans"); toExclude.add("database.mpqa.2.0/docs/ula/20000815_AFP_ARB.0084.IBM-HA-NEW"); toExclude.add("database.mpqa.2.0/docs/ula/116CUL032"); while(i.hasNext()) { AnnotatedText t = i.next(); String filename = (String) t.getProperty("mpqa_file"); if(toExclude.contains(filename)) continue; if(!docs.contains(filename)) continue; System.out.println("*** " + filename + " ***"); //System.out.println(t.text); /*boolean seen = false; for(int j = 0; j < t.text.length(); j++) { char c = t.text.charAt(j); if(c > 0xff) { seen = true; } } if(seen) System.out.println(t.text);*/ String outfile = filename.substring("database.mpqa.2.0/docs/".length()); outfile = outfile.replace('/', '_'); outfile = outDir + "/" + outfile; Util.printToFile(outfile, t.text, "UTF-8"); } } catch(Exception e) { e.printStackTrace(); System.exit(1); } } }