//////////////////////////////////////////////////////////////////
//
//    FreeLing - Open Source Language Analyzers
//
//    Copyright (C) 2004   TALP Research Center
//                         Universitat Politecnica de Catalunya
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//    contact: Lluis Padro (padro@lsi.upc.es)
//             TALP Research Center
//             despatx C6.212 - Campus Nord UPC
//             08034 Barcelona.  SPAIN
//
////////////////////////////////////////////////////////////////


//------------------------------------------------------------------//
//
//                    IMPORTANT NOTICE
//
//  This file contains a simple main program to illustrate 
//  usage of FreeLing analyzers library.
//
//  This sample main program may be used straightforwardly as 
//  a basic front-end for the analyzers (e.g. to analyze corpora)
//
//  Nevertheless, if you want to embed the FreeLing libraries inside
//  a larger application, or you want to deal with other 
//  input/output formats (e.g. XML), the efficient and elegant 
//  way to do so is to consider this file as a mere example, and call 
//  the library from your own main code.
//
//------------------------------------------------------------------//


#include <sstream>
#include <fstream>
#include <iostream>

#include <set>
#include <map>
#include <vector>
#include <boost/filesystem.hpp>

#include "freeling.h"
#include "freeling/morfo/util.h"
#include "config.h"

#undef MOD_TRACENAME
#define MOD_TRACENAME L"FREELING_SPPP"

using namespace std;
using namespace freeling;

// Form feed character (0x0C). ProcessPlain treats an input line starting
// with it as a request to flush the splitter buffer, and echoes the
// character back on both output streams.
const wchar_t FORMFEED=0x0C;

// we use pointers to the analyzers, so we
// can create only those strictly necessary.
// All of them are allocated and released in main(): tk, sp and morfo
// are always created; tagger and neclass are created only when the
// configured output format is TAGGED or beyond (neclass additionally
// requires NE classification to be enabled), and stay NULL otherwise.
tokenizer *tk=NULL;
splitter *sp=NULL;
maco *morfo=NULL;
nec *neclass=NULL;
POS_tagger *tagger=NULL;

// FL configuration options, built in main() from the program arguments.
config *cfg=NULL;

// Variables and classes to hold transformation rules FL->SPPP
class SPPP_rule {
  public:
    boost::u32regex form1;
    boost::u32regex lemma;
    boost::u32regex tag;
    bool any_form, pn_form;
    bool any_lemma, pn_lemma;
    bool any_tag, pn_tag;
 
    wstring stem;    
    wstring rule_id;
    wstring form2;

     SPPP_rule() {
       form1=boost::make_u32regex(L""); 
       lemma=boost::make_u32regex(L"");
       tag=boost::make_u32regex(L"");
       any_form=true; any_lemma=true; any_tag=true; 
       pn_form=true;  pn_lemma=true;  pn_tag=true;}
    ~SPPP_rule() {};
};

// Tables filled by read_SPPP_rules(), one per section of the rule file.

// <NoDisambiguate>: maps a key (print_token looks up the lowercased word
// form first, then each selected PoS tag) to the position marker read next
// to it; print_token compares that marker with "@any" and with the current
// position.
map<wstring,wstring> noTag;
// <Output>: rules in file order; print_analysis applies the first one
// whose conditions hold.
list<SPPP_rule> rules;
// <ReplaceAll>: maps a word form to the list of analyses that print_token
// installs in place of whatever the analyzers produced for that word.
map<wstring,list<analysis> > replaces;
// <Fusion>: each rule is a list of tags whose first element is the output
// tag and whose remaining elements are the tags that must be present.
list<list<wstring> > fusion;


//---------------------------------------------
// Write the given string, followed by a line
// break, to both standard output and standard
// error (each stream is flushed by endl).
//---------------------------------------------
void say(const wstring &s) {
  wostream *sinks[] = { &wcout, &wcerr };
  for (size_t k=0; k<2; ++k)
    (*sinks[k]) << s << endl;
}


//---------------------------------------------
// Replace, in place, the five XML special
// characters of s with their entity names.
//---------------------------------------------
void toXML(wstring &s){
  // '&' has to be handled first: doing it later would also rewrite
  // the ampersands introduced by the other entities.
  static const wchar_t* const entity[][2] = {
    { L"&",  L"&amp;"  },
    { L"\"", L"&quot;" },
    { L"<",  L"&lt;"   },
    { L">",  L"&gt;"   },
    { L"'",  L"&apos;" }
  };
  const size_t total = sizeof(entity)/sizeof(entity[0]);

  for (size_t k=0; k<total; ++k)
    util::find_and_replace(s, entity[k][0], entity[k][1]);
}

//---------------------------------------------
// Replace, in place, the five predefined XML
// entity names found in s with the characters
// they stand for.
//---------------------------------------------
void fromXML(wstring &s){
  // "&amp;" is decoded last, so that an ampersand produced by it is
  // never taken as the start of another entity.
  static const wchar_t* const entity[][2] = {
    { L"&quot;", L"\"" },
    { L"&lt;",   L"<"  },
    { L"&gt;",   L">"  },
    { L"&apos;", L"'"  },
    { L"&amp;",  L"&"  }
  };
  const size_t total = sizeof(entity)/sizeof(entity[0]);

  for (size_t k=0; k<total; ++k)
    util::find_and_replace(s, entity[k][0], entity[k][1]);
}

//---------------------------------------------
// print one analysis.
//---------------------------------------------
void print_analysis(const analysis &a, const wstring &form) {
 
  wstring lcform = util::lowercase(form);
  wstring tag = a.get_tag();
  wstring alemma = a.get_lemma();
  toXML(alemma);

  if (a.is_retokenizable()) {            
    say(L"    <analysis stem=\""+alemma+L"\" >");
    list<word> rtk=a.get_retokenizable();
    list<word>::iterator r;
    wstring rform;
    for (r=rtk.begin(); r!= rtk.end(); r++) {
      rform = r->get_form();
      toXML(rform);
      say(L"      <rule id=\""
	  + wstring(r == rtk.begin() ? L"" : L"+") + r->get_tag() 
	  + L"\" form=\"" + rform +L"\" />");
    }
    say(L"    </analysis>");
  }  

  else {

    wstring stem=L"NO-RULE-FOUND"; wstring rid=L"NO-RULE-FOUND"; wstring frm=L"NO-RULE-FOUND";
    bool trobat = false;
    for (list<SPPP_rule>::iterator r=rules.begin(); r!=rules.end() and not trobat; r++) {
      
      if ( (r->any_form or boost::u32regex_search(lcform,r->form1)==r->pn_form) and
	   (r->any_lemma or boost::u32regex_search(alemma,r->lemma)==r->pn_lemma) and
	   (r->any_tag or boost::u32regex_search(tag,r->tag)==r->pn_tag) ) {
	
	// matching rule found, apply right hand side.
	trobat = true;
	
	// compute stem
	if (r->stem==L"L") stem=alemma;
	else if (r->stem==L"T") stem=tag;
	else if (r->stem==L"F") stem=lcform;
	else stem=r->stem;
	
	// compute rule_id
	if (r->rule_id==L"L") rid=alemma;
	else if (r->rule_id==L"T") rid=tag;
	else if (r->rule_id==L"F") rid=lcform;
	
	// compute form
	wstring f=L"";
	for (size_t i=0; i<r->form2.size(); i++) {
	  if (r->form2[i]=='L') f=f+L"#"+alemma;
	  else if (r->form2[i]=='T') f=f+L"#"+tag;
	  else if (r->form2[i]=='F') f=f+L"#"+form;
	}
	if (not f.empty()) frm=f.substr(1);      
      }
    }
    
    say(L"    <analysis stem=\""+stem+L"\" >");
    say(L"      <rule id=\""+rid+L"\" form=\""+frm+L"\" />");
    say(L"    </analysis>");
  }
}


//---------------------------------------------
// check if the word matches some Fusion rule,
// and apply it if so.
//
//  w:      word whose analyses are checked and
//          possibly modified in place.
//  tagged: when true only the selected analyses
//          of w are considered, otherwise all of
//          them.
//
// A rule matches when some lemma appears in w
// with every one of the rule condition tags. For
// each such lemma, the first analysis carrying
// one of those tags gets the rule output tag and
// the remaining ones are erased from w.
//---------------------------------------------
void CheckFusion(word &w, bool tagged) {
  word::iterator wb,we,a;
  set<wstring> common;

  // choose the range of analyses to work on.
  if (not tagged) {
     wb = w.analysis_begin();
     we = w.analysis_end();
  }
  else {
     wb = w.selected_begin();
     we = w.selected_end();
  }

  // check all fusion rules.
  list<list<wstring> >::iterator r;
  for (r=fusion.begin(); r!=fusion.end(); r++) {
 
    common.clear(); // clear set of common lemmas.

    // check rule
    bool ok=true;
    list<wstring>::iterator tagout=r->begin(); // first tag is the output
    list<wstring>::iterator tag1=r->begin(); tag1++; // second tag is first condition

    // NOTE: a rule with no condition tags skips this loop with ok==true
    // and an empty 'common', so nothing is applied for it below.
    list<wstring>::iterator tr;
    for (tr=tag1; tr!=r->end() and ok; tr++) {
      // build a set with lemmas for current rule tag
      set<wstring> lems; 
      for (a=wb; a!=we; a++) 
	if ((*tr)==a->get_tag()) 
	  lems.insert(a->get_lemma());

      if (tr==tag1) 
	// first iteration, intersection so far is lems.
	common=lems; 
      else {
	// further iterations, accumulate intersection.
	set<wstring> is;
	set_intersection(common.begin(), common.end(), lems.begin(), lems.end(), inserter(is,is.begin()) );
	common=is;
      }

      // if accumulated intersection is empty, rule won't match.
      ok = not common.empty();
    }

    if (ok) {  // rule matched. Apply it      
      // for each lemma matching rule tags
      for (set<wstring>::iterator lem=common.begin(); lem!=common.end(); lem++) {

	// Locate and erase analysis, replacing the first 
        // with new tag.
	bool done=false;
	for (a=wb; a!=we; a++) {
	  // does this analysis have the current lemma and one of the
	  // condition tags of the rule? (the output tag is skipped)
	  bool found=false;
	  for (tr=r->begin(), tr++; tr!=r->end() and not found; tr++)
	    found = ((*tr)==a->get_tag() and (*lem)==a->get_lemma());
	  
	  if (found) {  // tag and lemma match. Delete analysis
	    if (not done) {
	      // first matching analysis. just replace tag.
	      a->set_tag(*tagout);
	      done=true;
	    }
	    else {
	      // not the first, delete.
	      // Remember the successor before erasing, then step back
	      // from it, so that the a++ of the loop lands on the
	      // analysis that followed the erased one.
	      word::iterator a2=a; a2++;
	      w.erase(a);	      
	      a2--; a=a2;  // fix iteration control
	    }
	  }
	}
      }	  
    }
  }
}


//---------------------------------------------
// Print one word as a <token> element holding
// one <analysis> element per printed analysis.
//
//  w:       word to print. Its analyses may be
//           modified: replaced as a whole if the
//           form is in the ReplaceAll table, and
//           merged by the Fusion rules.
//  currpos: position marker of the word, compared
//           with the marker stored in the
//           NoDisambiguate table.
//---------------------------------------------

void print_token(sentence::iterator w, const wstring &currpos) {

  // XML-encoded versions of the form and of its lowercased variant.
  wstring xmlform = w->get_form();
  toXML(xmlform);
  wstring xmllc = w->get_lc_form();
  toXML(xmllc);

  const wstring from = util::int2wstring(w->get_span_start());
  const wstring to = util::int2wstring(w->get_span_finish());
  say(L"  <token form=\""+xmlform+L"\" from=\""+from+L"\" to=\""+to+L"\" >");

  // A word listed in the ReplaceAll table gets its whole analysis list replaced.
  map<wstring,list<analysis> >::iterator repl = replaces.find(xmllc);
  if (repl!=replaces.end())
    w->set_analysis(repl->second);

  // Unless decided otherwise below, only the selected analyses are printed.
  word::iterator first = w->selected_begin();
  word::iterator last = w->selected_end();

  // Look for the word in the NoDisambiguate table: by form first and,
  // failing that, by the tag of each selected analysis in turn.
  map<wstring,wstring>::iterator nd = noTag.find(xmllc);
  for (word::iterator a=first; nd==noTag.end() and a!=last; a++)
    nd = noTag.find(a->get_tag());

  // All analyses are printed when the output format is MORFO, or when
  // the table entry found applies to any position or to the current one.
  bool everything = (cfg->OutputFormat==MORFO);
  if (not everything and nd!=noTag.end())
    everything = (nd->second==L"@any" or nd->second==currpos);

  if (everything) {
    first = w->analysis_begin();
    last = w->analysis_end();
  }

  CheckFusion(*w, not everything);

  for (word::const_iterator ait=first; ait!=last; ait++)
    print_analysis(*ait,xmlform);

  say(L"  </token>");
}


//---------------------------------------------
// print obtained analysis.
//---------------------------------------------
void PrintResults(list<sentence> &ls) {
  sentence::iterator w;
  sentence::iterator nxt;
  list<sentence>::iterator is;
  parse_tree tr;  
  dep_tree dep;
  int nsentence=1;
  bool prevde=false; 
 
  for (is=ls.begin(); is!=ls.end(); is++,++nsentence) {
   
    say(L"<segment>");
    wstring currpos=L"@begin";
    for (w=is->begin(); w!=is->end(); w++) {      

      print_token(w,currpos);

      if (w->is_ambiguous_mw()) {
	list<word> lw = w->get_words_mw();
	for (list<word>::iterator iw=lw.begin(); iw!=lw.end(); iw++) {
	  print_token(iw,currpos);
	  currpos=L"@any";
	}
      }	
      
      currpos=L"@any";
    }
    say(L"</segment>");
  }
}


//---------------------------------------------
// Plain text, start with tokenizer.
//---------------------------------------------
void ProcessPlain() {
  wstring text;
  list<word> av;
  list<word>::const_iterator i;
  list<sentence> ls;

    bool head=false;
    while (getline(wcin,text)) {

      wcerr<<L"   ## Read line ("<<text<<L")"<<endl;
      if (!head) {
	wstring::size_type p=text.find(L"<?xml version='1.0' encoding='utf-8'?>"); 
	if (p!=wstring::npos) {
	  text.erase(p,38);
	  say(L"<?xml version=\"1.0\" encoding=\"utf-8\"?>");
          head=true;
	}
	else ERROR_CRASH(L"ERROR - <?xml?> header expected");
      }

      if (text[0]==FORMFEED) {
	// process last sentence in buffer (if any)
	ls=sp->split(av, true);  //flush splitter buffer
	morfo->analyze(ls);
	if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
	if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);
	
	PrintResults(ls);
        wcout<<FORMFEED<<endl; 
        wcerr<<FORMFEED<<endl; 
        head=false;
      }
      else {
	// clean xml tags from input
	wstring::size_type p;
	util::find_and_replace(text, L"<text>", L"");
	util::find_and_replace(text, L"</text>", L"");

        // translate XML chars to latin1
	fromXML(text);
	
	av=tk->tokenize(text);
	ls=sp->split(av, cfg->AlwaysFlush);
	morfo->analyze(ls);
	if (cfg->OutputFormat>=TAGGED) tagger->analyze(ls);
	if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) neclass->analyze(ls);

	PrintResults(ls);	
	av.clear(); // clear list of words for next use
	ls.clear(); // clear list of sentences for next use
      }
    }

}


//---------------------------------------------
// Load the FreeLing->SPPP transformation rules
// from the file named by the environment
// variable FREELINGSPPP into the global tables
// noTag, replaces, fusion and rules.
//
// The file is made of sections enclosed in
// <NoDisambiguate>, <ReplaceAll>, <Fusion> and
// <Output> marks, each alone in its line. Lines
// outside any section are ignored.
//
// The program is aborted (ERROR_CRASH) if the
// variable is not defined, the file can not be
// opened, a Fusion or Output rule is malformed,
// or nothing was read from any section.
//---------------------------------------------
void read_SPPP_rules() {

  char* flsppp=getenv("FREELINGSPPP");
  if (flsppp==NULL){
    ERROR_CRASH(L"FREELINGSPPP is not defined. It should point to a file with rules tuning FreeLing output to meet the LKB grammar needs.");
  }
  
  wstring exp=util::string2wstring(flsppp);

  wifstream fitx;
  util::open_utf8_file(fitx,exp);
  if (fitx.fail()) ERROR_CRASH(L"Error opening rule file "+exp);

  wstring line;
  // section being read: 0=none, 1=NoDisambiguate, 2=ReplaceAll, 3=Fusion, 4=Output
  int reading=0;
  // number of lines seen while inside a section. Note that the line
  // opening a section is counted too.
  int read=0;
  while (getline(fitx,line)) {
    if (line == L"<NoDisambiguate>") reading=1;
    else if (line == L"</NoDisambiguate>") reading=0;
    else if (line == L"<ReplaceAll>") reading=2;
    else if (line == L"</ReplaceAll>") reading=0;
    else if (line == L"<Fusion>") reading=3;
    else if (line == L"</Fusion>") reading=0;
    else if (line == L"<Output>") reading=4;
    else if (line == L"</Output>") reading=0;

    else if (reading==1) {  // reading NoDisambiguate section
      // line format: key position-marker
      wistringstream sin(line);
      wstring form,at;
      sin>>form>>at;
      noTag.insert(make_pair(form,at));
    }

    else if (reading==2) { // whole analysis list replacements
      // line format: form lemma1 tag1 lemma2 tag2 ...
      wistringstream sin(line);

      wstring form,al,at;
      sin>>form;
      list<analysis> la;
      while (sin>>al>>at) la.push_back(analysis(al,at));
      replaces.insert(make_pair(form,la));
    }
 
    else if (reading==3) {  // reading Fusion section
      // line format: tag1 tag2 ... => output-tag
      wistringstream sin(line);
      list<wstring> rul;

      // Collect condition tags up to the "=>" mark. The loop also stops
      // when the line runs out of tokens: a failed extraction leaves
      // 'tag' unchanged, so testing only its value would never end on a
      // line lacking the mark.
      wstring tag;
      bool arrow=false;
      while (not arrow and sin>>tag) {
        if (tag==L"=>") arrow=true;
        else rul.push_back(tag);
      }
      if (not arrow) {
        ERROR_CRASH(L"Expecting '=>' in Fusion rule read from "+exp);
      }

      // store last tag at the first place.
      if (not (sin>>tag)) {
        ERROR_CRASH(L"Expecting output tag after '=>' in Fusion rule read from "+exp);
      }
      rul.push_front(tag);

      fusion.push_back(rul);
    }
 
    else if (reading==4) {  // Read output field arrangements
      // line format: form lemma tag => stem rule_id form
      // Each left-hand side field is either "*" (anything), a regular
      // expression, or a regular expression preceded by '!' (negation).
      wistringstream sin(line);

      SPPP_rule r;  // new rule.

      wstring x;
      sin>>x;  /// get form
      if (x!=L"*") {
        r.any_form=false;
        if (x[0]=='!') { r.pn_form=false; x = x.substr(1); }
        r.form1 = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;  /// get lemma
      if (x!=L"*") {
        r.any_lemma=false;
        if (x[0]=='!') { r.pn_lemma=false; x = x.substr(1); }
        r.lemma = boost::make_u32regex(L"^"+x+L"$");
      }
      sin>>x;  /// get tag
      if (x!=L"*") {
        r.any_tag=false;
        if (x[0]=='!') { r.pn_tag=false; x = x.substr(1); }
        // unlike form and lemma, the tag pattern is anchored only at
        // the beginning.
        r.tag = boost::make_u32regex(L"^"+x);
      }

      sin>>x; 
      if (x!=L"=>") ERROR_CRASH(L"Expecting '=>' in rule read from sppp.dat");

      sin>>r.stem;
      sin>>r.rule_id;
      sin>>r.form2;

      // Rest of the line (if any) is ignored (comments).

      // add rule to rule list.
      rules.push_back(r);
    }

    if (reading!=0) read++;
  }

  if (read==0) ERROR_CRASH(L"Rule file "+exp+L" contains no rules.");
}
  

//---------------------------------------------
// Sample main program
//---------------------------------------------
int main(int argc, char **argv) {

  util::init_locale(L"default");

  /// load transformation file from FreeLing to SPPP
  read_SPPP_rules();

  // read configuration file and command-line options
  cfg = new config(argc, argv);

  // create required analyzers
  tk = new tokenizer(cfg->TOK_TokenizerFile); 
  sp = new splitter(cfg->SPLIT_SplitterFile);

  // the morfo class requires several options at creation time.
  // they are passed packed in a maco_options object.
  maco_options opt(cfg->Lang);
  // boolean options to activate/desactivate modules
  // default: all modules activated (options set to "true")
  opt.set_active_modules (cfg->MACO_UserMap, 
                          cfg->MACO_AffixAnalysis,    cfg->MACO_MultiwordsDetection,
			  cfg->MACO_NumbersDetection, cfg->MACO_PunctuationDetection,
			  cfg->MACO_DatesDetection,   cfg->MACO_QuantitiesDetection,
			  cfg->MACO_DictionarySearch, cfg->MACO_ProbabilityAssignment,
			  cfg->MACO_NERecognition,    false);

  // decimal/thousand separators used by number detection
  opt.set_nummerical_points(cfg->MACO_Decimal, cfg->MACO_Thousand);
  // Minimum probability for a tag for an unkown word
  opt.set_threshold(cfg->MACO_ProbabilityThreshold);
  // Data files for morphological submodules. by default set to ""
  // Only files for active modules have to be specified 
  opt.set_data_files (cfg->MACO_UserMapFile, 
                      cfg->MACO_LocutionsFile, cfg->MACO_QuantitiesFile,
		      cfg->MACO_AffixFile, cfg->MACO_ProbabilityFile,
		      cfg->MACO_DictionaryFile, cfg->MACO_NPDataFile,
		      cfg->MACO_PunctuationFile, L"");

  // create analyzer with desired options
  morfo = new maco(opt);

  if (cfg->OutputFormat>=TAGGED) {
     if (cfg->TAGGER_which == HMM)
       tagger = new hmm_tagger(cfg->TAGGER_HMMFile,cfg->TAGGER_Retokenize, cfg->TAGGER_ForceSelect);
     else if (cfg->TAGGER_which == RELAX)
       tagger = new relax_tagger(cfg->TAGGER_RelaxFile, cfg->TAGGER_RelaxMaxIter,
                                 cfg->TAGGER_RelaxScaleFactor,
                                 cfg->TAGGER_RelaxEpsilon,
                                 cfg->TAGGER_Retokenize,
                                 cfg->TAGGER_ForceSelect);
  }

  if (cfg->OutputFormat>=TAGGED && cfg->NEC_NEClassification) {
    neclass = new nec(cfg->NEC_NECFile);
  }


  // Input is plain text.
  ProcessPlain();

  // clean up. Note that deleting a null pointer is a safe (yet useless) operation
  delete cfg;
  delete tk;
  delete sp; 
  delete morfo; 
  delete tagger;
  delete neclass; 
}