# -*- coding: utf-8 -*-

module TextlabOBTStat

  # stub ActiveRecord classes from tag-annotator
  # Holds the sentences that make up one text.
  class Text
    attr_accessor :sentence_count, :sentences, :postamble

    # Starts out with an empty sentence list.
    def initialize
      @sentences = []
    end

    # Collects the words of every sentence, in sentence order, and
    # returns them as one flattened array.
    def words
      @sentences.map(&:words).flatten
    end
  end

  # An ordered list of words plus the attributes found on the sentence
  # delimiter tag.
  class Sentence
    attr_accessor :words, :length, :text_index, :attrs

    def initialize
      # Attributes of the sentence delimiter tag, if there was one.
      # An empty hash means a tag was present but carried no attributes.
      @attrs = nil
      @words = []
    end

    # The normalized strings of all words, separated by single spaces.
    def to_s
      @words.map(&:normalized_string).join(' ')
    end

    # The original strings of all words, separated by single spaces.
    def to_orig_s
      @words.map(&:orig_string).join(' ')
    end
  end

  # A word form together with its candidate tags.
  class Word
    attr_accessor :string, :orig_string, :sentence_index, :tag_count, :tags, :input_string, :preamble,
                  :end_of_sentence_p

    PUNCTUATION_REGEX = Regexp.compile('^\$?[\.\:\|\?\!]$') # .:|!?

    def initialize
      @string = nil
      @orig_string = nil
      @tags = []
      @preamble = []
      @end_of_sentence_p = nil
    end

    # Returns a normalized copy of the word string: the space before a
    # percent sign is removed, « » become ASCII double quotes, the `$`
    # prefix is stripped from punctuation, and remaining whitespace is
    # replaced by underscores.
    def normalized_string
      # remove space in front of percent sign
      return @string.gsub(/^(\d+)\s+(\%)$/, '\1\2') if @string.match(/^\d+\s+\%$/)

      # normalize fancy quotes to ascii ones
      string = @string.gsub(/[«»]/, '"')

      # drop the `$` marker in front of punctuation characters
      string = string.gsub(/\$([\.\:\|\?\!\,\(\)\-\"\;])/, '\1')
      string.gsub(/\s/, '_')
    end

    # returns the appropriate string for output from the tagger
    # that is the original string if available otherwise the OB
    # word form string
    def output_string
      @orig_string || @string
    end

    # Returns the first tag for which `tag.equal(str)` is truthy, or nil
    # when no tag matches.
    def tag_by_string(str)
      @tags.find { |t| t.equal(str) }
    end

    # Number of tags attached to this word.
    def get_ambiguities
      @tags.length
    end

    # True when the word carries more than one tag.
    def ambigious?
      get_ambiguities > 1
    end

    # Returns the first tag for which `tag.equal(tag)` is truthy, or nil.
    def match_clean_out_tag(tag)
      @tags.find { |t| t.equal(tag) }
    end

    # Number of whitespace separated parts in the word string.
    def word_count
      string.split(/\s/).length
    end

    # All tags whose `correct` attribute is truthy.
    def get_correct_tags
      @tags.select { |t| t.correct }
    end

    # Returns the single correct tag, or nil when there is none.
    # Raises RuntimeError when more than one tag is marked correct.
    def get_correct_tag
      correct_tags = get_correct_tags
      raise RuntimeError, 'more than one correct tag' if correct_tags.length > 1

      correct_tags.first
    end

    # Returns the single selected tag, or nil when no tag is selected.
    # Raises RuntimeError when more than one tag is marked selected.
    def get_selected_tag
      selected = @tags.select { |t| t.selected }
      raise RuntimeError, 'more than one selected tag' if selected.length > 1

      selected.first
    end

    # Number of tags marked correct.
    def correct_count
      get_correct_tags.length
    end

    # this must be expanded if sentence segmentation is made more complex
    #
    # A word at sentence index 0 always counts as capitalized. Otherwise
    # the first correct tag decides; without correct tags every tag must
    # be capitalized.
    def capitalized?
      return true if @sentence_index == 0

      correct_tags = get_correct_tags
      return correct_tags.first.capitalized unless correct_tags.empty?

      @tags.all? { |t| t.capitalized }
    end

    # Truthy (a MatchData) when the word string matches PUNCTUATION_REGEX,
    # nil otherwise.
    def is_punctuation?
      @string.match(PUNCTUATION_REGEX)
    end

    # Removes, in place, every tag whose [clean_out_tag, lemma] pair has
    # already been seen earlier in the list; the first occurrence is kept.
    # Returns the tag array.
    #
    # Uses uniq! rather than deleting inside an `each` loop: deleting
    # while iterating shifts the remaining elements and makes the loop
    # skip the element following each deletion.
    def remove_duplicate_clean_tags!
      @tags.uniq! { |tag| [tag.clean_out_tag, tag.lemma] }
      @tags
    end

    # Value of the end-of-sentence flag.
    def end_of_sentence?
      @end_of_sentence_p
    end
  end

  # One candidate tag of a word.
  class Tag
    attr_accessor :lemma, :string, :correct, :selected, :capitalized, :index, :input_string

    CLEAN_TAG_REGEX = Regexp.compile('((i|pa|tr|pr|r|rl|a|d|n)\d+(\/til)?)')

    def initialize
      @string = nil
      @correct = nil
    end

    # Returns the tag string reduced to its "clean out" form: elements
    # joined by underscores, with everything matching CLEAN_TAG_REGEX
    # removed.
    def clean_out_tag
      # remove unnecessary info from the tag field for "joined words". These words
      # uniquely have a @ in their tag, with the tag being the token in front of this.
      #
      # ie. "prep+subst prep @adv" is turned into "prep" from the middle field
      #
      # gsub leaves the string untouched when the pattern does not match,
      # so no separate check for '@' is needed.
      tag = @string.gsub(/^[\w\+]+\s(\w+)\s@.+$/, '\1')

      # we treat clb marked punctuation the same as unmarked
      tag = tag.gsub(/^clb (.*)$/, '\1')

      tag.gsub(CLEAN_TAG_REGEX, '').strip.gsub(/\s+/, '_')
    end

    # Compares this tag with a clean out style tag string, ignoring the
    # order of the underscore separated elements.
    #
    # Returns true when both consist of exactly the same elements
    # (repeated elements must occur equally often on both sides),
    # false otherwise.
    def equal(tag_str)
      # assume we're passed a clean out style tag
      tag_str.split('_').sort == clean_out_tag.split('_').sort
    end
  end
end