//////////////////////////////////////////////////////////////////
//
//    FreeLing - Open Source Language Analyzers
//
//    Copyright (C) 2004   TALP Research Center
//                         Universitat Politecnica de Catalunya
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU General Public
//    License as published by the Free Software Foundation; either
//    version 3 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    General Public License for more details.
//
//    You should have received a copy of the GNU General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
//    contact: Lluis Padro (padro@lsi.upc.es)
//             TALP Research Center
//             despatx C6.212 - Campus Nord UPC
//             08034 Barcelona.  SPAIN
//
////////////////////////////////////////////////////////////////

#ifndef _CONFIG
#define _CONFIG

// NOTE(review): the targets of the next two #include directives are missing
// in this copy of the file (text between angle brackets appears to have been
// stripped). The code below uses std::cerr, std::string, getenv and
// boost::program_options, so presumably a standard stream header and
// <boost/program_options.hpp> were named here -- TODO restore from the
// upstream file before building.
#include
#include
namespace po = boost::program_options;

#include "freeling/morfo/traces.h"
#include "freeling/morfo/util.h"

// Module identification for trace messages.
// NOTE(review): presumably read by the WARNING macro used below; the macro
// itself is not defined in this file -- confirm in traces.h.
#define MOD_TRACENAME L"CONFIG_OPTIONS"
#define MOD_TRACECODE OPTIONS_TRACE

#define DefaultConfigFile "analyzer.cfg" // default ConfigFile

// codes for input-output formats
#define PLAIN 0
#define IDENT 1
#define TOKEN 2
#define SPLITTED 3
#define MORFO 4
#define TAGGED 5
#define SENSES 6
#define SHALLOW 7
#define PARSED 8
#define DEP 9

// codes for tagging algorithms
#define HMM 0
#define RELAX 1

// codes for dependency parsers
#define TXALA 0

// codes for sense annotation
#define NONE 0
#define ALL 1
#define MFS 2
#define UKB 3

// codes for ForceSelect
#define FORCE_NONE 0
#define FORCE_TAGGER 1
#define FORCE_RETOK 2

////////////////////////////////////////////////////////////////
///  Class config holds the set of options used by the NLP
///  analyzer. Options are declared and parsed with
///  boost::program_options (aliased as "po"): one set of
///  descriptions for the command line and another one for the
///  configuration file, both bound to the same variables.
////////////////////////////////////////////////////////////////

class config {

 public:
  /// Name of the configuration file (command-line option --fcfg/-f,
  /// defaults to DefaultConfigFile)
  std::string ConfigFile;

  /// Language of text to process
  std::wstring Lang;
  /// Locale of text to process
  std::wstring Locale;

  /// Level of analysis in input and output
  int InputFormat, OutputFormat;

  /// Flush splitter at each line
  bool AlwaysFlush;

  /// produce output in a format suitable to train the tagger.
  bool TrainingOutput;

  /// Tokenizer options
  std::wstring TOK_TokenizerFile;

  /// Splitter options
  std::wstring SPLIT_SplitterFile;

  /// Morphological analyzer options
  bool MACO_UserMap, MACO_AffixAnalysis, MACO_MultiwordsDetection,
    MACO_NumbersDetection, MACO_PunctuationDetection,
    MACO_DatesDetection, MACO_QuantitiesDetection,
    MACO_DictionarySearch, MACO_ProbabilityAssignment,
    MACO_OrthographicCorrection, MACO_NERecognition;

  /// Morphological analyzer options
  std::wstring MACO_Decimal, MACO_Thousand;

  /// Language identifier options
  std::wstring IDENT_identFile;

  /// Morphological analyzer options
  std::wstring MACO_UserMapFile, MACO_LocutionsFile, MACO_QuantitiesFile,
    MACO_AffixFile, MACO_ProbabilityFile, MACO_DictionaryFile,
    MACO_NPDataFile, MACO_PunctuationFile, MACO_CorrectorFile;

  double MACO_ProbabilityThreshold;
  bool MACO_RetokContractions;

  // NEC options
  bool NEC_NEClassification;
  std::wstring NEC_NECFile;

  // Sense annotator options
  int SENSE_WSD_which;
  std::wstring SENSE_ConfigFile;
  std::wstring UKB_ConfigFile;

  /// Tagger options
  std::wstring TAGGER_HMMFile;
  std::wstring TAGGER_RelaxFile;
  int TAGGER_which;
  int TAGGER_RelaxMaxIter;
  double TAGGER_RelaxScaleFactor;
  double TAGGER_RelaxEpsilon;
  bool TAGGER_Retokenize;
  int TAGGER_ForceSelect;

  /// Parser options
  std::wstring PARSER_GrammarFile;

  /// Dependency options
  std::wstring DEP_TxalaFile;

  /// Coreference resolution options
  bool COREF_CoreferenceResolution;
  std::wstring COREF_CorefFile;

  /// constructor
  ///
  /// Declares the command-line and configuration-file options and parses
  /// the command line.
  /// @param ac  argument count, forwarded to po::parse_command_line
  /// @param av  argument vector, forwarded to po::parse_command_line
  config(int ac, char **av) {

    // Auxiliary variables to store options read as strings before they are converted
    // to their final enumerate/integer values
    std::string InputF, OutputF, Tagger, SenseAnot, Force;
    std::string tracemod;

    // Narrow-string receivers for options whose final members are std::wstring.
    std::string language, locale, identFile, tokFile, splitFile,
      macoDecimal, macoThousand, usermapFile, locutionsFile,
      quantitiesFile, affixFile, probabilityFile, dictionaryFile,
      npDataFile, punctuationFile, correctorFile;
    std::string necFile, senseFile, ukbFile;
    std::string hmmFile,relaxFile,grammarFile,txalaFile,corefFile;

    // Options accepted on the command line. Boolean switches are declared
    // as value-less pairs (--opt / --noopt); valued options write straight
    // into the auxiliary variables or members given to po::value.
    po::options_description vis_cl("Available command-line options");
    vis_cl.add_options()
      ("help,h", "Help about command-line options.")
      ("help-cf", "Help about configuration file options.")
#ifndef WIN32
      ("version,v", "Print installed FreeLing version.")
#endif
      ("fcfg,f", po::value(&ConfigFile)->default_value(DefaultConfigFile), "Configuration file to use")
      ("lang",po::value(&language),"language of the input text")
      ("locale",po::value(&locale),"locale encoding of input text (\"default\"=en_US.UTF-8, \"system\"=current system locale, [other]=any valid locale string installed in the system (e.g. ca_ES.UTF-8,it_IT.UTF-8,...)")
      ("flush","Consider each newline as a sentence end")
      ("noflush","Do not consider each newline as a sentence end")
      ("inpf",po::value(&InputF),"Input format (plain,token,splitted,morfo,sense,tagged)")
      ("outf",po::value(&OutputF),"Output format (ident,token,splitted,morfo,tagged,shallow,parsed,dep)")
      ("train","Produce output format suitable for train scripts")
      ("fidn,I",po::value(&identFile),"Language identifier file")
      ("ftok",po::value(&tokFile),"Tokenizer rules file")
      ("fsplit",po::value(&splitFile),"Splitter option file")
      ("afx","Perform affix analysis")
      ("noafx","Do not perform affix analysis")
      ("usr","Apply user mapping file")
      ("nousr","Do not apply user mapping file")
      ("loc","Perform multiword detection")
      ("noloc","Do not perform multiword detection")
      ("numb","Perform number detection")
      ("nonumb","Do not perform number detection")
      ("punt","Perform punctuation detection")
      ("nopunt","Do not perform punctuation detection")
      ("date","Perform date/time expression detection")
      ("nodate","Do not perform date/time expression detection")
      ("quant","Perform magnitude/ratio detection")
      ("noquant","Do not perform magnitude/ratio detection")
      ("dict","Perform dictionary search")
      ("nodict","Do not perform dictionary search")
      ("prob","Perform probability assignment")
      ("noprob","Do not perform probability assignment")
      ("rtkcon","Dictionary retokenizes contractions regardless of --nortk option")
      ("nortkcon","Dictionary leaves contraction retokenization to --rtk/nortk option")
      ("orto","Perform ortographic correction")
      ("noorto","Do not perform ortographic correction")
      ("ner","Perform NE recognition")
      ("noner","Do not perform NE recognition")
      ("dec",po::value(&macoDecimal),"Decimal point character")
      ("thou",po::value(&macoThousand),"Thousand point character")
      ("fmap,M",po::value(&usermapFile),"User-map file")
      ("floc,L",po::value(&locutionsFile),"Multiwords file")
      ("fqty,Q",po::value(&quantitiesFile),"Quantities file")
      ("fafx,S",po::value(&affixFile),"Affix rules file")
      ("fprob,P",po::value(&probabilityFile),"Probabilities file")
      ("thres,e",po::value(&MACO_ProbabilityThreshold),"Probability threshold for unknown word tags")
      ("fdict,D",po::value(&dictionaryFile),"Form dictionary")
      ("fnp,N",po::value(&npDataFile),"NE recognizer data file")
      ("fcorr,K",po::value(&correctorFile),"Ortographic corrector configuration file")
      ("fpunct,F",po::value(&punctuationFile),"Punctuation symbol file")
      ("nec","Perform NE classification")
      ("nonec","Do not perform NE classification")
      ("fnec",po::value(&necFile),"NEC configuration file")
      ("sense,s",po::value(&SenseAnot),"Type of sense annotation (no|none,all,mfs,ukb)")
      ("fsense,W",po::value(&senseFile),"Configuration file for sense annotation module")
      ("fukb,U",po::value(&ukbFile),"Configuration file for UKB word sense disambiguator")
      ("hmm,H",po::value(&hmmFile),"Data file for HMM tagger")
      ("rlx,R",po::value(&relaxFile),"Data file for RELAX tagger")
      ("tag,t",po::value(&Tagger),"Tagging alogrithm to use (hmm, relax)")
      ("iter,i",po::value(&TAGGER_RelaxMaxIter),"Maximum number of iterations allowed for RELAX tagger")
      ("sf,r",po::value(&TAGGER_RelaxScaleFactor),"Support scale factor for RELAX tagger (affects step size)")
      ("eps",po::value(&TAGGER_RelaxEpsilon),"Convergence epsilon value for RELAX tagger")
      ("rtk","Perform retokenization after PoS tagging")
      ("nortk","Do not perform retokenization after PoS tagging")
      ("force",po::value(&Force),"When the tagger must be forced to select only one tag per word (no|none,tagger,retok)")
      ("grammar,G",po::value(&grammarFile),"Grammar file for chart parser")
      ("txala,T",po::value(&txalaFile),"Rule file for Txala dependency parser")
      ("coref","Perform coreference resolution")
      ("nocoref","Do not perform coreference resolution")
      ("fcorf,C",po::value(&corefFile),"Coreference solver data file")
      ;

    // Options accepted in the configuration file. They are bound to the
    // same variables as their command-line counterparts; here boolean
    // options are real valued options that write directly into the class
    // members, and several entries carry explicit defaults.
    po::options_description vis_cf("Available configuration file options");
    vis_cf.add_options()
      ("Lang",po::value(&language),"Language of the input text")
      ("Locale",po::value(&locale)->default_value("default"),"locale encoding of input text (\"default\"=en_US.UTF-8, \"system\"=current system locale, [other]=any valid locale string installed in the system (e.g. ca_ES.UTF-8,it_IT.UTF-8,...)")
      ("AlwaysFlush",po::value(&AlwaysFlush)->default_value(false),"Consider each newline as a sentence end")
      ("InputFormat",po::value(&InputF)->default_value("plain"),"Input format (plain,token,splitted,morfo,sense,tagged)")
      ("OutputFormat",po::value(&OutputF)->default_value("tagged"),"Output format (token,splitted,morfo,tagged,shallow,parsed,dep)")
      ("TokenizerFile",po::value(&tokFile),"Tokenizer rules file")
      ("SplitterFile",po::value(&splitFile),"Splitter option file")
      ("AffixAnalysis",po::value(&MACO_AffixAnalysis)->default_value(false),"Perform affix analysis")
      ("UserMap",po::value(&MACO_UserMap)->default_value(false),"Apply user mapping file")
      ("MultiwordsDetection",po::value(&MACO_MultiwordsDetection)->default_value(false),"Perform multiword detection")
      ("NumbersDetection",po::value(&MACO_NumbersDetection)->default_value(false),"Perform number detection")
      ("PunctuationDetection",po::value(&MACO_PunctuationDetection)->default_value(false),"Perform punctuation detection")
      ("DatesDetection",po::value(&MACO_DatesDetection)->default_value(false),"Perform date/time expression detection")
      ("QuantitiesDetection",po::value(&MACO_QuantitiesDetection)->default_value(false),"Perform magnitude/ratio detection")
      ("DictionarySearch",po::value(&MACO_DictionarySearch)->default_value(false),"Perform dictionary search")
      ("RetokContractions",po::value(&MACO_RetokContractions)->default_value(true),"Dictionary retokenizes contractions regardless of --nortk option")
      ("ProbabilityAssignment",po::value(&MACO_ProbabilityAssignment)->default_value(false),"Perform probability assignment")
      ("OrthographicCorrection",po::value(&MACO_OrthographicCorrection)->default_value(false),"Perform ortographic correction")
      ("NERecognition",po::value(&MACO_NERecognition)->default_value(false),"Perform NE recognition")
      ("DecimalPoint",po::value(&macoDecimal),"Decimal point character")
      ("ThousandPoint",po::value(&macoThousand),"Thousand point character")
      ("UserMapFile",po::value(&usermapFile),"User mapping file")
      ("LocutionsFile",po::value(&locutionsFile),"Multiwords file")
      ("QuantitiesFile",po::value(&quantitiesFile),"Quantities file")
      ("AffixFile",po::value(&affixFile),"Affix rules file")
      ("ProbabilityFile",po::value(&probabilityFile),"Probabilities file")
      ("ProbabilityThreshold",po::value(&MACO_ProbabilityThreshold),"Probability threshold for unknown word tags")
      ("DictionaryFile",po::value(&dictionaryFile),"Form dictionary")
      ("NPDataFile",po::value(&npDataFile),"NP recognizer data file")
      ("CorrectorFile",po::value(&correctorFile),"Ortographic corrector configuration file")
      ("PunctuationFile",po::value(&punctuationFile),"Punctuation symbol file")
      ("NEClassification",po::value(&NEC_NEClassification)->default_value(false),"Perform NE classification")
      ("NECFile",po::value(&necFile),"NEC configuration file")
      ("SenseAnnotation",po::value(&SenseAnot)->default_value("none"),"Type of sense annotation (no|none,all,mfs,ukb)")
      ("SenseConfigFile",po::value(&senseFile),"Configuration file for sense annotation module")
      ("UKBConfigFile",po::value(&ukbFile),"Configuration file for UKB word sense disambiguator")
      ("TaggerHMMFile",po::value(&hmmFile),"Data file for HMM tagger")
      ("TaggerRelaxFile",po::value(&relaxFile),"Data file for RELAX tagger")
      ("Tagger",po::value(&Tagger)->default_value("hmm"),"Tagging alogrithm to use (hmm, relax)")
      ("TaggerRelaxMaxIter",po::value(&TAGGER_RelaxMaxIter),"Maximum number of iterations allowed for RELAX tagger")
      ("TaggerRelaxScaleFactor",po::value(&TAGGER_RelaxScaleFactor),"Support scale factor for RELAX tagger (affects step size)")
      ("TaggerRelaxEpsilon",po::value(&TAGGER_RelaxEpsilon),"Convergence epsilon value for RELAX tagger")
      ("TaggerRetokenize",po::value(&TAGGER_Retokenize)->default_value(false),"Perform retokenization after PoS tagging")
      ("TaggerForceSelect",po::value(&Force)->default_value("retok"),"When the tagger must be forced to select only one tag per word (no|none,tagger,retok)")
      ("GrammarFile",po::value(&grammarFile),"Grammar file for chart parser")
      ("DepTxalaFile",po::value(&txalaFile),"Rule file for Txala dependency parser")
      ("CoreferenceResolution",po::value(&COREF_CoreferenceResolution)->default_value(false),"Perform coreference resolution")
      ("CorefFile",po::value(&corefFile),"Coreference solver data file")
      ;

    // Trace-related options are kept in separate "hidden" groups, apart
    // from the vis_* descriptions above.
    po::options_description hid_cl("Hidden CL options");
    hid_cl.add_options()
      ("tlevel,l",po::value(&freeling::traces::TraceLevel),"Debug traces verbosity")
      ("tmod,m",po::value(&tracemod),"Mask indicating which modules to trace")
      ;

    po::options_description hid_cf("Hidden CF options");
    hid_cf.add_options()
      ("TraceLevel",po::value(&freeling::traces::TraceLevel)->default_value(0),"Debug traces verbosity")
      ("TraceModule",po::value(&tracemod)->default_value("0x0"),"Mask indicating which modules to trace")
      ;

    // Full option sets: visible + hidden, for each of the two sources.
    po::options_description cl_op("All command line options");
    cl_op.add(vis_cl).add(hid_cl);

    po::options_description cf_op("All configuration file options");
    cf_op.add(vis_cf).add(hid_cf);

    po::variables_map vm;

    // The command line is parsed and stored into vm first.
    try {
      po::store(po::parse_command_line(ac, av, cl_op), vm);
      po::notify(vm);
    }
    catch (...) {
      // NOTE(review): the statement below is damaged in this copy of the
      // file: everything between the '<' and the '>' has been lost, i.e. the
      // rest of this handler and the remainder of the constructor body up to
      // a final extraction into freeling::traces::TraceModule. Neither cf_op
      // nor the private helpers ExpandFileName / SetBooleanOptionCL are used
      // anywhere in the surviving text, so presumably the configuration file
      // parsing and the conversion of the auxiliary strings into the public
      // members lived in the missing part -- TODO restore from the upstream
      // file; do not reconstruct by hand.
      std::cerr<<"Exception ocurred while parsing command line"<> freeling::traces::TraceModule;
    }
  }

 private:

  /// Expands an environment variable reference inside a file name, in place.
  ///
  /// Only the first '$' in the string is handled. The variable name runs
  /// from the character after the '$' up to the next '/' or '\' (or to the
  /// end of the string). If the variable is not set in the environment, a
  /// warning is issued and the reference is replaced by an empty string.
  /// Empty strings and strings without '$' are left untouched.
  ///
  /// @param s  file name to expand; overwritten with the expanded value.
  void ExpandFileName(std::string &s) {
    if (s.empty()) return;

    std::string name = s;
    size_t n=name.find_first_of("$");
    if (n!=std::string::npos) {
      // end of the variable name: next path separator, or end of string
      size_t i=name.find_first_of("/\\",n+1);
      if (i==std::string::npos) i=name.size();
      char* exp=getenv(name.substr(n+1,i-n-1).c_str());
      if (exp==NULL){
        WARNING(L"Undefined variable "+freeling::util::string2wstring(name.substr(n+1,i-n-1))+L" in configuration file "+freeling::util::string2wstring(ConfigFile)+L" expanded as empty string.");
        // drop the "$NAME" reference, keep what precedes and follows it
        name = name.substr(0,n) + name.substr(i);
      }
      else {
        name = name.substr(0,n) + std::string(exp) + name.substr(i);
      }

      s=name;
    }
  }

  /// Resolves a boolean option given on the command line as a pair of
  /// switches (--name / --noname).
  ///
  /// @param pos   non-zero if the positive switch was given
  /// @param neg   non-zero if the negative switch was given
  /// @param opt   option value to update; left unchanged when both or
  ///              neither switch was given
  /// @param name  option name, used only in the warning message
  void SetBooleanOptionCL (const int pos, const int neg, bool &opt, const std::string &name) {
    if (pos && neg)
      WARNING(L"Ambiguous specification for option --"+freeling::util::string2wstring(name)+L" in command line. Using default value.");
    else if (pos)
      opt=true;
    else if (neg)
      opt=false;
    //else: nothing specified, leave things as they are.
  }

};

#endif