require_relative 'globals'

module TextlabOBTStat
  # Maps word forms to candidate lemmas with relative frequencies and picks
  # the most likely lemma for a word. When the word is not covered by the
  # model, a backoff strategy (lemma frequency list or prefix similarity)
  # is used instead.
  #
  # @todo refactor model training to separate class
  class LemmaModel
    UTF8_MODEL_FN = File.join(TextlabOBTStat.root_path, "models", "trening-u-flert-d.lemma_model.utf8")
    LATIN1_MODEL_FN = File.join(TextlabOBTStat.root_path, "models", "trening-u-flert-d.lemma_model")

    UTF8_NOWAC_FREQ_FN = File.join(TextlabOBTStat.root_path, "models", "nowac07_z10k-lemma-frq-noprop.lst.utf8")
    LATIN1_NOWAC_FREQ_FN = File.join(TextlabOBTStat.root_path, "models", "nowac07_z10k-lemma-frq-noprop.lst")

    DEFAULT_FILE = "data/trening-u-flert-d.train.cor"
    VERSION_1_FILE_HEADER = "version 1"
    LEMMA_DATA_SEP = "^"

    attr_reader :model, :unknown_model

    # @param opts [Hash] options
    #   :format   - 'utf-8' (default) or 'latin1'; selects the default model
    #               and frequency list files.
    #   :model_fn - file to build the model from; defaults to the model file
    #               for the chosen format.
    # @raise [NotImplementedError] if the format is unsupported
    def initialize(opts={})
      @lemma_backoff_disambiguation = :nowac # :prefix, :nowac or :nowac_full
      @unknown_model = {}
      @format = opts[:format] || 'utf-8'
      @model_fn = opts[:model_fn] || default_model_fn(@format)
      # NOTE(review): the default file names end in ".lemma_model", which
      # looks like the output of write_lemma_model, yet the file is handed to
      # create_lemma_model (which parses cor data) rather than
      # read_lemma_model. Left unchanged - confirm which one is intended.
      @model = create_lemma_model(@model_fn)

      read_unknown_model if nowac_backoff?
    end

    # @param format [String] 'utf-8' or 'latin1'
    # @return [String] path of the default lemma model file for the format
    # @raise [NotImplementedError] for any other format
    def default_model_fn(format)
      case format
      when 'utf-8'
        UTF8_MODEL_FN
      when 'latin1'
        LATIN1_MODEL_FN
      else
        raise NotImplementedError, "unsupported format: #{format.inspect}"
      end
    end

    # @param format [String] 'utf-8' or 'latin1'
    # @return [String] path of the lemma frequency list for the format
    # @raise [NotImplementedError] for any other format
    def nowac_freq_fn(format)
      case format
      when 'utf-8'
        UTF8_NOWAC_FREQ_FN
      when 'latin1'
        LATIN1_NOWAC_FREQ_FN
      else
        # The exception must be raised, not merely evaluated; otherwise the
        # class object itself is returned and later passed to File.open.
        raise NotImplementedError, "unsupported format: #{format.inspect}"
      end
    end

    # @param word [String] a word form
    # @return [Array<Array>, nil] the [lemma, probability] pairs stored for
    #   the word, or nil if the word is not in the model
    def model_entry(word)
      @model[word]
    end

    # @param word [String] a word form
    # @return [String, nil] the lemma with the highest probability for the
    #   word (the first one on ties), or nil if the word has no lemmas
    def top_lemma(word)
      lemmas = @model[word]
      return nil if lemmas.nil? || lemmas.empty?

      # Strict comparison keeps the earliest entry when probabilities tie.
      lemmas.inject { |best, entry| entry[1] > best[1] ? entry : best }[0]
    end

    # @param w1 [String]
    # @param w2 [String]
    # @return [Integer] the length of the common prefix of w1 and w2
    def prefix_similarity(w1, w2)
      len = [w1.length, w2.length].min
      # The index of the first mismatch equals the number of matching
      # leading characters.
      len.times { |i| return i if w1[i] != w2[i] }
      len
    end

    # Chooses one lemma for a word from a list of admissible lemmas.
    #
    # Model entries for the word are restricted to lemmas present in
    # lemma_list and the most probable of them is returned. If nothing
    # remains, the configured backoff strategy decides.
    #
    # @param word [String] the word form
    # @param lemma_list [Array<String>] admissible lemmas
    # @return [String, nil] the chosen lemma
    # @raise [RuntimeError] if no remaining entry has a positive probability
    #   or the backoff strategy is unknown
    def disambiguate_lemma(word, lemma_list)
      candidates = @model[word]

      # filter incompatible lemmas
      # TODO handle punctuation with prefixed $
      candidates = candidates.select { |entry| lemma_list.include?(entry.first) } unless candidates.nil?

      return backoff_lemma(word, lemma_list) if candidates.nil? || candidates.empty?

      best_score = 0
      best_lemma = nil

      candidates.each do |lemma, prob|
        next unless prob > best_score

        # Track the running maximum so the most probable lemma wins, not
        # merely the last one with a positive probability.
        best_score = prob
        best_lemma = lemma
      end

      raise RuntimeError, "no lemma with positive probability for #{word.inspect}" if best_lemma.nil?

      best_lemma
    end

    # Counts how often each lemma occurs for each word form in a text.
    #
    # Words whose get_correct_tags result does not hold exactly one tag are
    # skipped and counted separately.
    #
    # @param text [#sentences] parsed text; each sentence responds to words,
    #   each word to get_correct_tags and string, each tag to lemma
    # @return [Array] a pair [counts, skipped], where counts maps
    #   word => { lemma => occurrences } and skipped is the number of words
    #   left out
    def lemma_counts(text)
      counts = {}
      no_correct = 0

      text.sentences.each do |sentence|
        sentence.words.each do |w|
          tags = w.get_correct_tags

          if tags.count != 1
            no_correct += 1
            next
          end

          lemma = tags.first.lemma
          by_lemma = (counts[w.string] ||= {})
          by_lemma[lemma] = (by_lemma[lemma] || 0) + 1
        end
      end

      [counts, no_correct]
    end

    # Creates a lemma model based on the cor file passed as the file
    # argument, and stores this model in the @model instance variable.
    #
    # @param file [String, IO] name of a properly formatted cor file. $stdin
    #   may be passed, allowing the data to be read from it.
    # @return [Hash] the populated @model variable, mapping each word to a
    #   list of [lemma, relative frequency] pairs
    # @todo probably broken
    def create_lemma_model(file)
      @model = {}

      # check if $stdin is passed and read from the file or
      # input stream as appropriate
      if file == $stdin
        # The stream is handed to the parser exactly like an opened file.
        # NOTE(review): assumes OBNOText.parse accepts any IO - confirm.
        build_model_from($stdin)
      else
        File.open(file) { |f| build_model_from(f) }
      end

      @model
    end

    # Writes the lemma model to a file. The first line in the file is a
    # version header. Subsequent lines contain word forms and
    # lemma/probability pairs separated by tabs. The lemma strings and
    # probabilities are separated by a ^ (hat) character.
    #
    # @param file [String, IO] the file name to write the model to, or
    #   $stdout to print the model
    # @return [nil]
    def write_lemma_model(file)
      if file == $stdout
        dump_lemma_model($stdout)
      else
        # Block form closes the file even if writing raises.
        File.open(file, 'w') { |f| dump_lemma_model(f) }
      end

      nil
    end

    # Reads a lemma model from file, and binds it to the @model instance
    # variable.
    #
    # @param file [String] name of a file containing a properly formatted
    #   model
    # @return [Hash] the populated @model instance variable
    # @raise [RuntimeError] on a wrong header, a malformed lemma entry or a
    #   word that occurs on more than one line
    def read_lemma_model(file)
      @model = {}

      File.open(file, 'r') do |f|
        # first line should be a valid header
        if f.readline.strip != VERSION_1_FILE_HEADER
          raise RuntimeError, "unexpected lemma model header in #{file}"
        end

        f.each_line do |line|
          tokens = line.split("\t")
          word = tokens[0]

          lemmas = tokens[1...tokens.count].collect do |field|
            parts = field.split(LEMMA_DATA_SEP)
            raise RuntimeError, "malformed lemma entry #{field.inspect}" if parts.count != 2

            [parts[0], parts[1].to_f]
          end

          raise RuntimeError, "duplicate word #{word.inspect} in lemma model" if @model[word]

          @model[word] = lemmas
        end
      end

      @model
    end

    # Loads the frequency list for the current format into @unknown_model.
    #
    # Each line is split on whitespace. The second column is the key and
    # the frequency is taken from the fourth column for :nowac or the first
    # column for :nowac_full. Frequencies of repeated keys are summed.
    #
    # @raise [RuntimeError] if the backoff strategy is neither :nowac nor
    #   :nowac_full
    def read_unknown_model
      freq_column =
        case @lemma_backoff_disambiguation
        when :nowac then 3
        when :nowac_full then 0
        else raise RuntimeError, "no frequency list for backoff #{@lemma_backoff_disambiguation.inspect}"
        end

      File.open(nowac_freq_fn(@format)) do |f|
        f.each_line do |line|
          vals = line.strip.split
          key = vals[1]
          freq = vals[freq_column].to_i

          @unknown_model[key] = (@unknown_model[key] || 0) + freq
        end
      end
    end

    private

    # True when the backoff strategy relies on the frequency list.
    def nowac_backoff?
      @lemma_backoff_disambiguation == :nowac || @lemma_backoff_disambiguation == :nowac_full
    end

    # Picks a lemma from lemma_list using the configured backoff strategy;
    # falls back to the first lemma when no candidate has a score.
    def backoff_lemma(word, lemma_list)
      scored =
        if nowac_backoff?
          lemma_list.collect { |lemma| [lemma, @unknown_model[lemma]] }
        elsif @lemma_backoff_disambiguation == :prefix
          lemma_list.collect { |lemma| [lemma, prefix_similarity(word, lemma)] }
        else
          raise RuntimeError, "unknown backoff strategy #{@lemma_backoff_disambiguation.inspect}"
        end

      scored = scored.reject { |pair| pair[1].nil? }
      return lemma_list.first if scored.empty?

      scored.max { |a, b| a[1] <=> b[1] }.first
    end

    # Parses cor data from io and fills @model with relative lemma
    # frequencies per word.
    def build_model_from(io)
      # parse the cor text data
      text = OBNOText.parse(io)

      # collect correct lemma counts and construct model
      counts = lemma_counts(text).first

      counts.each do |word, by_lemma|
        total = by_lemma.values.inject { |sum, n| sum + n }.to_f
        @model[word] = by_lemma.collect { |lemma, count| [lemma, count / total] }
      end
    end

    # Writes the version header and one line per model entry to io.
    def dump_lemma_model(io)
      io.puts VERSION_1_FILE_HEADER

      @model.each do |word, lemmas|
        fields = lemmas.collect { |entry| entry.join(LEMMA_DATA_SEP) }.join("\t")
        io.puts word + "\t" + fields
      end
    end
  end
end