////////////////////////////////////////////////////////////////// // // FreeLing - Open Source Language Analyzers // // Copyright (C) 2004 TALP Research Center // Universitat Politecnica de Catalunya // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // contact: Lluis Padro (padro@lsi.upc.es) // TALP Research Center // despatx C6.212 - Campus Nord UPC // 08034 Barcelona. SPAIN // //////////////////////////////////////////////////////////////// //------------------------------------------------------------------// // // IMPORTANT NOTICE // // This file contains a simple main program to illustrate // usage of FreeLing analyzers library. // // This sample main program may be used straightforwardly as // a basic front-end for the analyzers (e.g. to analyze corpora) // // Nevertheless, if you want to embed the FreeLing libraries inside // a larger application, or you want to deal with other // input/output formats (e.g. XML), the efficient and elegant // way to do so is to consider this file as a mere example, and call // the library from your own main code. 
// NOTE(review): the physical line below appears to be many original source
// lines joined into one, so everything after its leading "//" is currently
// comment text. Text between '<' and '>' also looks to have been dropped:
// the #include directives carry no header names, the map/list globals carry
// no template arguments, and say() runs straight into the tail of the
// XML-encoding helper that the rest of the file calls as toXML().
// TODO: restore this section from the upstream sources before building.
//
// What is still visible, in order:
//  - FORMFEED (0x0C) and the analyzer pointers tk, sp, morfo, neclass, tagger
//    plus cfg, all initialised to NULL;
//  - class SPPP_rule: three boost::u32regex patterns (form1, lemma, tag);
//    for each pattern an any_* flag (true = condition is skipped) and a pn_*
//    flag (the regex_search result the condition must equal); and the output
//    fields stem, rule_id and form2. The constructor sets every flag to true
//    and every pattern to the empty regex;
//  - the containers noTag, rules, replaces and fusion, which are filled by
//    read_SPPP_rules();
//  - say(): per its header comment, writes a string to both cout and cerr
//    (body damaged);
//  - fromXML(): five in-place util::find_and_replace passes over its argument;
//    per its header comment they decode XML special characters (the search
//    patterns are damaged).
// //------------------------------------------------------------------// #include #include #include #include #include #include #include #include "freeling.h" #include "freeling/morfo/util.h" #include "config.h" #undef MOD_TRACENAME #define MOD_TRACENAME L"FREELING_SPPP" using namespace std; using namespace freeling; const wchar_t FORMFEED=0x0C; // we use pointers to the analyzers, so we // can create only those strictly necessary. tokenizer *tk=NULL; splitter *sp=NULL; maco *morfo=NULL; nec *neclass=NULL; POS_tagger *tagger=NULL; // FL configuration options config *cfg=NULL; // Variables and classes to hold transformation rules FL->SPPP class SPPP_rule { public: boost::u32regex form1; boost::u32regex lemma; boost::u32regex tag; bool any_form, pn_form; bool any_lemma, pn_lemma; bool any_tag, pn_tag; wstring stem; wstring rule_id; wstring form2; SPPP_rule() { form1=boost::make_u32regex(L""); lemma=boost::make_u32regex(L""); tag=boost::make_u32regex(L""); any_form=true; any_lemma=true; any_tag=true; pn_form=true; pn_lemma=true; pn_tag=true;} ~SPPP_rule() {}; }; map noTag; list rules; map > replaces; list > fusion; //--------------------------------------------- // output a string both in cout and cerr //--------------------------------------------- void say(const wstring &s) { wcout<", L">"); util::find_and_replace(s, L"'", L"'"); } //--------------------------------------------- // decode special chars from XML //--------------------------------------------- void fromXML(wstring &s){ util::find_and_replace(s, L""", L"\""); util::find_and_replace(s, L"<", L"<"); util::find_and_replace(s, L">", L">"); util::find_and_replace(s, L"'", L"'"); util::find_and_replace(s, L"&", L"&"); } //--------------------------------------------- // print one analysis. 
// NOTE(review): the three physical lines below hold print_analysis(),
// CheckFusion(), print_token() and PrintResults() with their original line
// breaks removed, so each "//" comment now swallows the code that follows it
// on the same line. Text between '<' and '>' also looks to have been dropped:
// most say(L"...") literals are empty or truncated, container declarations
// have no template arguments, and the form2 loop condition in
// print_analysis() is incomplete. TODO: restore from the upstream sources.
//
// print_analysis(a, form)
//   Takes the lowercased form, the tag and the XML-encoded lemma of `a`.
//   If `a` is retokenizable, emits one say() line per word returned by
//   get_retokenizable(), using that word's tag and XML-encoded form.
//   Otherwise scans `rules` and stops at the first rule whose form, lemma and
//   tag conditions all hold; a condition holds when its any_* flag is set or
//   when u32regex_search() equals its pn_* flag. From that rule:
//     stem    <- lemma / tag / lowercased form for "L" / "T" / "F",
//                otherwise the literal r->stem;
//     rule_id <- lemma / tag / lowercased form for "L" / "T" / "F";
//     form    <- for each 'L' / 'T' / 'F' character of form2, "#" followed by
//                lemma / tag / form as passed in, with the leading "#"
//                dropped.
//   All three default to "NO-RULE-FOUND".
//   NOTE(review): unlike stem, rule_id has no literal fallback, so any other
//   rule_id value leaves "NO-RULE-FOUND" in place. Confirm this is intended.
//
// CheckFusion(w, tagged)
//   Works on the selected analyses of `w` when `tagged` is true, on all
//   analyses otherwise. In each entry of `fusion` the first tag is the output
//   tag and the remaining tags are conditions. For every rule it intersects,
//   across the condition tags, the sets of lemmas that carry each tag. For
//   every lemma left in the intersection, the first analysis matching
//   (condition tag, lemma) has its tag replaced by the output tag, and later
//   matching analyses are erased from `w`.
//
// print_token(w, currpos)
//   XML-encodes the form and lowercase form of `w` and emits an opening line
//   built from get_span_start() / get_span_finish(). If the lowercase form is
//   a key of `replaces`, the word's analyses are replaced by the stored list.
//   Only the selected analyses are printed by default. All analyses are
//   printed when cfg->OutputFormat==MORFO, or when `noTag` has an entry for
//   the lowercase form (or, failing that, for a selected tag) whose value is
//   "@any" or equals `currpos`. CheckFusion() runs before printing.
//
// PrintResults(ls)
//   For every sentence in `ls`, prints each word via print_token(). `currpos`
//   is "@begin" for the first token of the sentence and "@any" afterwards.
//   For words where is_ambiguous_mw() holds, the words from get_words_mw()
//   are printed as well.
//--------------------------------------------- void print_analysis(const analysis &a, const wstring &form) { wstring lcform = util::lowercase(form); wstring tag = a.get_tag(); wstring alemma = a.get_lemma(); toXML(alemma); if (a.is_retokenizable()) { say(L" "); list rtk=a.get_retokenizable(); list::iterator r; wstring rform; for (r=rtk.begin(); r!= rtk.end(); r++) { rform = r->get_form(); toXML(rform); say(L" get_tag() + L"\" form=\"" + rform +L"\" />"); } say(L" "); } else { wstring stem=L"NO-RULE-FOUND"; wstring rid=L"NO-RULE-FOUND"; wstring frm=L"NO-RULE-FOUND"; bool trobat = false; for (list::iterator r=rules.begin(); r!=rules.end() and not trobat; r++) { if ( (r->any_form or boost::u32regex_search(lcform,r->form1)==r->pn_form) and (r->any_lemma or boost::u32regex_search(alemma,r->lemma)==r->pn_lemma) and (r->any_tag or boost::u32regex_search(tag,r->tag)==r->pn_tag) ) { // matching rule found, apply right hand side. trobat = true; // compute stem if (r->stem==L"L") stem=alemma; else if (r->stem==L"T") stem=tag; else if (r->stem==L"F") stem=lcform; else stem=r->stem; // compute rule_id if (r->rule_id==L"L") rid=alemma; else if (r->rule_id==L"T") rid=tag; else if (r->rule_id==L"F") rid=lcform; // compute form wstring f=L""; for (size_t i=0; iform2.size(); i++) { if (r->form2[i]=='L') f=f+L"#"+alemma; else if (r->form2[i]=='T') f=f+L"#"+tag; else if (r->form2[i]=='F') f=f+L"#"+form; } if (not f.empty()) frm=f.substr(1); } } say(L" "); say(L" "); say(L" "); } } //--------------------------------------------- // check if the word matches some Fusion rule, // and apply it if so. //--------------------------------------------- void CheckFusion(word &w, bool tagged) { word::iterator wb,we,a; set common; if (not tagged) { wb = w.analysis_begin(); we = w.analysis_end(); } else { wb = w.selected_begin(); we = w.selected_end(); } // check all fusion rules. list >::iterator r; for (r=fusion.begin(); r!=fusion.end(); r++) { common.clear(); // clear set of common lemmas. 
// check rule bool ok=true; list::iterator tagout=r->begin(); // first tag is the output list::iterator tag1=r->begin(); tag1++; // second tag is first condition list::iterator tr; for (tr=tag1; tr!=r->end() and ok; tr++) { // build a set with lemmas for current rule tag set lems; for (a=wb; a!=we; a++) if ((*tr)==a->get_tag()) lems.insert(a->get_lemma()); if (tr==tag1) // first iteration, intersection so far is lem. common=lems; else { // further iterations, accumulate intersection. set is; set_intersection(common.begin(), common.end(), lems.begin(), lems.end(), inserter(is,is.begin()) ); common=is; } // if acumulated intersection is empty, rule won't match. ok = not common.empty(); } if (ok) { // rule matched. Apply it // for each lemma matching rule tags for (set::iterator lem=common.begin(); lem!=common.end(); lem++) { // Locate and erase analysis, replacing the first // with new tag. bool done=false; for (a=wb; a!=we; a++) { bool found=false; for (tr=r->begin(), tr++; tr!=r->end() and not found; tr++) found = ((*tr)==a->get_tag() and (*lem)==a->get_lemma()); if (found) { // tag and lemma match. Delete analysis if (not done) { // first matching analysis. just replace tag. a->set_tag(*tagout); done=true; } else { // not the first, delete. word::iterator a2=a; a2++; w.erase(a); a2--; a=a2; // fix iteration control } } } } } } } //--------------------------------------------- // output info for given word //--------------------------------------------- void print_token(sentence::iterator w, const wstring &currpos) { wstring wform=w->get_form(); toXML(wform); wstring lcform=w->get_lc_form(); toXML(lcform); say(L" get_span_start())+L"\" to=\""+util::int2wstring(w->get_span_finish())+L"\" >"); // if the word is in the 'replace' list, replace all its analysis. map >::iterator p=replaces.find(lcform); if (p!=replaces.end()) w->set_analysis(p->second); // Assume OutputFormat=TAGGED. Output only selected analysis. 
bool tagged=true; word::iterator wb = w->selected_begin(); word::iterator we = w->selected_end(); map::iterator nd=noTag.find(lcform); // find form in noTag list // if not found, try searching any selected PoS tag. for (word::iterator a=wb; nd==noTag.end() and a!=we; a++) nd=noTag.find(a->get_tag()); // if output is MORFO or word/tag was in NoDisambiguate list (and position matches), output all analysis. if (cfg->OutputFormat==MORFO or (nd!=noTag.end() and (nd->second==L"@any" or nd->second==currpos))) { wb = w->analysis_begin(); we = w->analysis_end(); tagged=false; } CheckFusion(*w,tagged); for (word::const_iterator ait=wb; ait!=we; ait++) print_analysis(*ait,wform); say(L" "); } //--------------------------------------------- // print obtained analysis. //--------------------------------------------- void PrintResults(list &ls) { sentence::iterator w; sentence::iterator nxt; list::iterator is; parse_tree tr; dep_tree dep; int nsentence=1; bool prevde=false; for (is=ls.begin(); is!=ls.end(); is++,++nsentence) { say(L""); wstring currpos=L"@begin"; for (w=is->begin(); w!=is->end(); w++) { print_token(w,currpos); if (w->is_ambiguous_mw()) { list lw = w->get_words_mw(); for (list::iterator iw=lw.begin(); iw!=lw.end(); iw++) { print_token(iw,currpos); currpos=L"@any"; } } currpos=L"@any"; } say(L""); } } //--------------------------------------------- // Plain text, start with tokenizer. 
// NOTE(review): the four physical lines below hold ProcessPlain(),
// read_SPPP_rules() and main() with their original line breaks removed, so
// each "//" comment now swallows the code that follows it on the same line.
// Text between '<' and '>' also looks to have been dropped: the header check
// and the stream output in ProcessPlain() are truncated, the section markers
// compared in read_SPPP_rules() all read L"", and container declarations
// have no template arguments. TODO: restore from the upstream sources.
//
// ProcessPlain()
//   Reads wcin line by line. A header is expected first; if it is missing,
//   ERROR_CRASH(L"ERROR - header expected") is raised (the exact check is
//   damaged). A line starting with FORMFEED flushes the splitter buffer and
//   analyzes whatever sentences remain. Any other line is decoded with
//   fromXML(), tokenized, and split using cfg->AlwaysFlush. In both cases the
//   sentences go through morfo, then the tagger when
//   cfg->OutputFormat>=TAGGED, then the NE classifier when
//   cfg->NEC_NEClassification is also set, and are printed with
//   PrintResults().
//
// read_SPPP_rules()
//   Reads the rule file named by the FREELINGSPPP environment variable and
//   raises ERROR_CRASH when the variable is unset, the file cannot be opened,
//   or no line was read inside any section. The `reading` state selects how
//   each line is parsed:
//     1: "form position" pair, inserted into noTag;
//     2: a form followed by lemma/tag pairs, stored in replaces as a full
//        analysis list for that form;
//     3: condition tags up to "=>", then the output tag, which is pushed to
//        the front of the list appended to fusion;
//     4: "form lemma tag => stem rule_id form2". "*" leaves the condition
//        unconstrained and a leading '!' clears the matching pn_* flag. Form
//        and lemma patterns are wrapped as ^...$; the tag pattern is only
//        prefixed with ^. A missing "=>" raises ERROR_CRASH.
//   NOTE(review): in state 3 the loop `while (tag!=L"=>")` has no
//   end-of-input check, so a line without "=>" looks like it would never
//   terminate. Confirm and guard when restoring.
//
// main(argc, argv)
//   Initialises the locale, loads the SPPP rules, builds the config from the
//   command line, then creates the tokenizer, splitter and maco analyzers.
//   The tagger (HMM or RELAX, per cfg->TAGGER_which) is created only when
//   cfg->OutputFormat>=TAGGED, and the NE classifier only when
//   cfg->NEC_NEClassification is set as well. It then runs ProcessPlain() and
//   deletes every object it created.
//--------------------------------------------- void ProcessPlain() { wstring text; list av; list::const_iterator i; list ls; bool head=false; while (getline(wcin,text)) { wcerr<"); if (p!=wstring::npos) { text.erase(p,38); say(L""); head=true; } else ERROR_CRASH(L"ERROR - header expected"); } if (text[0]==FORMFEED) { // process last sentence in buffer (if any) ls=sp->split(av, true); //flush splitter buffer morfo->analyze(ls); if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls); if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls); PrintResults(ls); wcout<", L""); util::find_and_replace(text, L"", L""); // translate XML chars to latin1 fromXML(text); av=tk->tokenize(text); ls=sp->split(av, cfg->AlwaysFlush); morfo->analyze(ls); if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls); if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls); PrintResults(ls); av.clear(); // clear list of words for next use ls.clear(); // clear list of sentences for next use } } } //--------------------------------------------- // Locate file sppp.dat in the same place than // the executable, and load the transformation // rules. //--------------------------------------------- void read_SPPP_rules() { wstring name; char* flsppp=getenv("FREELINGSPPP"); if (flsppp==NULL){ ERROR_CRASH(L"FREELINGSPPP is not defined. 
It should point to a file with rules tuning FreeLing output to meet the LKB grammar needs."); } wstring exp=util::string2wstring(flsppp); wifstream fitx; util::open_utf8_file(fitx,exp); if (fitx.fail()) ERROR_CRASH(L"Error opening rule file "+exp); wstring line; int reading=0; int read=0; while (getline(fitx,line)) { if (line == L"") reading=1; else if (line == L"") reading=0; else if (line == L"") reading=2; else if (line == L"") reading=0; else if (line == L"") reading=3; else if (line == L"") reading=0; else if (line == L"") reading=4; else if (line == L"") reading=0; else if (reading==1) { // reading NoDisambiguate section wistringstream sin(line); wstring form,at; sin>>form>>at; noTag.insert(make_pair(form,at)); } else if (reading==2) { // whole analysis list replacements wistringstream sin(line); wstring form,al,at; sin>>form; list la; while (sin>>al>>at) la.push_back(analysis(al,at)); replaces.insert(make_pair(form,la)); } else if (reading==3) { wistringstream sin(line); list rul; wstring tag; sin>>tag; while (tag!=L"=>") { rul.push_back(tag); sin>>tag; } // store last tag at the first place. sin>>tag; rul.push_front(tag); fusion.push_back(rul); } else if (reading==4) { // Read output field arrangements wistringstream sin(line); SPPP_rule r; // new rule. wstring x; sin>>x; /// get form if (x!=L"*") { r.any_form=false; if (x[0]=='!') { r.pn_form=false; x = x.substr(1); } r.form1 = boost::make_u32regex(L"^"+x+L"$"); } sin>>x; /// get lemma if (x!=L"*") { r.any_lemma=false; if (x[0]=='!') { r.pn_lemma=false; x = x.substr(1); } r.lemma = boost::make_u32regex(L"^"+x+L"$"); } sin>>x; /// get tag if (x!=L"*") { r.any_tag=false; if (x[0]=='!') { r.pn_tag=false; x = x.substr(1); } r.tag = boost::make_u32regex(L"^"+x); } sin>>x; if (x!=L"=>") ERROR_CRASH(L"Expecting '=>' in rule read from sppp.dat"); sin>>r.stem; sin>>r.rule_id; sin>>r.form2; // Rest of the line (if any) is ignored (comments). // add rule to rule list. 
rules.push_back(r); } if (reading!=0) read++; } if (read==0) ERROR_CRASH(L"Rule file "+exp+L" contains no rules."); } //--------------------------------------------- // Sample main program //--------------------------------------------- int main(int argc, char **argv) { util::init_locale(L"default"); /// load transformation file from FreeLing to SPPP read_SPPP_rules(); // read configuration file and command-line options cfg = new config(argc, argv); // create required analyzers tk = new tokenizer(cfg->TOK_TokenizerFile); sp = new splitter(cfg->SPLIT_SplitterFile); // the morfo class requires several options at creation time. // they are passed packed in a maco_options object. maco_options opt(cfg->Lang); // boolean options to activate/desactivate modules // default: all modules activated (options set to "true") opt.set_active_modules (cfg->MACO_UserMap, cfg->MACO_AffixAnalysis, cfg->MACO_MultiwordsDetection, cfg->MACO_NumbersDetection, cfg->MACO_PunctuationDetection, cfg->MACO_DatesDetection, cfg->MACO_QuantitiesDetection, cfg->MACO_DictionarySearch, cfg->MACO_ProbabilityAssignment, cfg->MACO_NERecognition, false); // decimal/thousand separators used by number detection opt.set_nummerical_points(cfg->MACO_Decimal, cfg->MACO_Thousand); // Minimum probability for a tag for an unkown word opt.set_threshold(cfg->MACO_ProbabilityThreshold); // Data files for morphological submodules. 
by default set to "" // Only files for active modules have to be specified opt.set_data_files (cfg->MACO_UserMapFile, cfg->MACO_LocutionsFile, cfg->MACO_QuantitiesFile, cfg->MACO_AffixFile, cfg->MACO_ProbabilityFile, cfg->MACO_DictionaryFile, cfg->MACO_NPDataFile, cfg->MACO_PunctuationFile, L""); // create analyzer with desired options morfo = new maco(opt); if (cfg->OutputFormat>=TAGGED) { if (cfg->TAGGER_which == HMM) tagger = new hmm_tagger(cfg->TAGGER_HMMFile,cfg->TAGGER_Retokenize, cfg->TAGGER_ForceSelect); else if (cfg->TAGGER_which == RELAX) tagger = new relax_tagger(cfg->TAGGER_RelaxFile, cfg->TAGGER_RelaxMaxIter, cfg->TAGGER_RelaxScaleFactor, cfg->TAGGER_RelaxEpsilon, cfg->TAGGER_Retokenize, cfg->TAGGER_ForceSelect); } if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) { neclass = new nec(cfg->NEC_NECFile); } // Input is plain text. ProcessPlain(); // clean up. Note that deleting a null pointer is a safe (yet useless) operation delete cfg; delete tk; delete sp; delete morfo; delete tagger; delete neclass; }