#ifndef _TRIGRAM_H_ #define _TRIGRAM_H_ #define MINLOGPROB -20 #include #include #include #include #include #include #include #include #include "tdl_options.h" namespace biostrm = boost::iostreams; inline std::string const &STAG() { static std::string stag = ""; return stag; } inline std::string const &ETAG() { static std::string etag = ""; return etag; } double logsumexp (std::vector els); double sumexp (std::vector els); typedef std::map CMap; typedef CMap::value_type CMapVal; typedef boost::unordered_map SMap; typedef SMap::value_type SMapVal; typedef std::map *> TGMap; typedef TGMap::value_type TGMapVal; typedef std::map BGMap; typedef BGMap::value_type BGMapVal; typedef boost::unordered_map > affixMap; using boost::assign::map_list_of; enum TagType {FULL, NOAFFIX, LETYPE, MSCAFFIX, MSC}; const std::map TagTypeName = map_list_of (FULL, "FULL") (NOAFFIX, "NOAFFIX") (LETYPE, "LETYPE") (MSCAFFIX, "MSCAFFIX") (MSC, "MSC"); class tTrigramModel { public: tTrigramModel(){}; tTrigramModel(std::vector &ifiles, tdlOptions *tdlopts=NULL, bool lambdaset=false, double l1=0, double l2=0, double l3=0, bool quiet=false); tTrigramModel(std::string fbasename); tTrigramModel(tdlOptions *tdlopts); ~tTrigramModel(); double getTransProb(std::string a, std::string b, std::string c); double getEmit(std::string t, std::string w); void trigramDeletedInterpolation(); void calculateEmissionScores(); void calculateTransitionScores(); void clearCounts(); void writeCompiled(std::string fname, bool quiet, std::string comment); void write_header(biostrm::filtering_stream &of, std::string comment); void read_affix_file(affixMap &amap, std::string filename); void read_mapgen_file(std::string filename); void read_whitelist_file(std::string filename); bool ut_debug() {return _ut_debug;}; bool on_whitelist(std::string s); void normalise(std::string *surface, std::string *tag); std::string lowercase(std::string orig); std::string caseclass_sep() { return _caseclass_sep;} private: SMap trigrams_s; SMap bigrams_s; SMap unigrams_s; SMap emissions_s; int total; //total tokens std::map twcounts; //word/tag counts TGMap tgcounts; //trigram tag counts BGMap bgcounts; //bigram tag counts CMap ugcounts; //unigram tag counts double lambda3, lambda2, lambda1; TagType tagtype; bool mapgen; bool _ut_debug; std::string _caseclass_sep; void init(tdlOptions *tdlopts = NULL); void readEmissionScores(std::string fname); void readTransitionScores(std::string fname); void incTWCounts(std::string t, std::string w, int c=1); void incTGCounts(std::string t1, std::string t2, std::string t3, int c=1); void incBGCounts(std::string t1, std::string t2, int c=1); void incUGCounts(std::string t1, int c=1); //until I read the affix rules properly affixMap suffixes; affixMap prefixes; boost::unordered_mapgenerics_map; boost::unordered_mapwhitelist; }; #endif