;;; IMPORTANT: The code in this file builds on the functions from the provided
;;; solution to exercise 2a. In other words, we assume you have already loaded
;;; the definitions given in `solution2a.lsp'.

;;; Initializing the vector space:
#||
(defparameter vspace
  (length-normalize-vs
   (read-corpus-to-vs "brown2.txt" "words.txt")))
||#

;; 1 A) Background:
;;
;; A sensible solution here could be to represent the proximity matrix
;; as a hash-table of hash-tables (or some other associative look-up
;; structure, like a-lists). One could then impose some canonical
;; ordering of the keys (i.e. words), for example using
;; `string>'. Another possibility would be to use only a single
;; hash-table and let each key be a list of the two ordered words
;; (making sure to specify `#'equal' as the test in
;; `make-hash-table').
;;
;; In this sample solution, however, we instead implement an abstract
;; data type for symmetric matrices, taking the opportunity to
;; showcase some more advanced Common Lisp techniques (like the use of
;; `defsetf'). Note that this solution goes quite a few steps beyond
;; what is expected from your own submissions (and is also much longer
;; than the straightforward hash-table solution outlined above).
;;
;; Based on a linear array, `symat' implements a maximally compact
;; representation of symmetric matrices. The implementation takes
;; advantage of the fact that a given two-dimensional array can
;; actually be "linearized" as a one-dimensional array, usually
;; referred to as a "row-major order" representation.

(defstruct (symat
            ;; internal constructor used by make-symat:
            (:constructor internal-make-symat (dimensions table)))
  dimensions table)

;; Redefine the default make-* constructor. For an n x n symmetric
;; matrix we only need to store n(n+1)/2 cells (the upper triangle,
;; including the diagonal):
(defun make-symat (dim)
  (internal-make-symat dim (make-array (/ (* dim (1+ dim)) 2))))

(defun symat-ref (sm i j)
  "Accessor function for the symat data type."
  (let ((i (min i j))
        (j (max i j)))
    ;; Access is computed based on (a) the row-major order index
    ;; minus (b) the redundant values, corresponding to the sum of
    ;; positive integers up to i:
    (aref (symat-table sm)
          (- (+ j (* (symat-dimensions sm) i)) ;; (a)
             (/ (* i (1+ i)) 2)))))            ;; (b)

(defun set-symat-ref (sm i j v)
  "Destructive modifier function to be used by setf."
  (let ((i (min i j))
        (j (max i j)))
    (setf (aref (symat-table sm)
                (- (+ j (* (symat-dimensions sm) i))
                   (/ (* i (1+ i)) 2)))
          v)))

;; Make setf work with symat-ref:
(defsetf symat-ref set-symat-ref)

(defmethod print-object ((sm symat) stream)
  "symat pretty-printing for the Lisp REPL."
  (print-unreadable-object (sm stream :type t :identity t)
    (format stream "[dim: ~d]" (symat-dimensions sm))))
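;; A quick illustration of the symat type in use (not part of the
;; exercise itself; the expected values simply follow from the
;; definitions above). Note how an update for the pair (0 2) is also
;; visible for (2 0), since both map to the same cell in the
;; underlying linear array:
#||
(let ((sm (make-symat 3)))
  (setf (symat-ref sm 0 2) 0.5)
  (list (symat-ref sm 0 2)
        (symat-ref sm 2 0)))
||#
;; ==>
;; (0.5 0.5)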
;; Finally, below we add some functionality for mapping between
;; strings (words) and numerical ids. We'll later use the ids for
;; indexing the proximity matrix. We'll use the hash-table in the
;; vs-slot `string-map' for storing the mapping from strings to ids,
;; and the slot `id-map' for storing an array recording the reverse
;; mapping from numerical identifiers back to strings.

(defun word-count (vs)
  "Looks up the number of words in the vector space."
  (hash-table-count (vs-matrix vs)))

(defun map-strings-to-ids (vs)
  ;; We'll use an array for mapping from numerical ids to strings
  ;; (words), stored in the vs-slot `id-map':
  (setf (vs-id-map vs) (make-array (word-count vs)))
  ;; We'll use the hash-table in the slot `string-map' for mapping
  ;; strings to ids. Here we assign the mappings:
  (loop
      with string-map = (vs-string-map vs)
      with id-map = (vs-id-map vs)
      for word being the hash-keys of (vs-matrix vs)
      for i from 0
      do
        (setf (gethash word string-map) i)
        (setf (aref id-map i) word)))

(defun id2string (id vs)
  (aref (vs-id-map vs) id))

(defun string2id (string vs)
  (gethash string (vs-string-map vs)))

;;; 1 A)

(defun compute-proximities (vs)
  "Computes and stores a proximity matrix for a given vs structure."
  (let* ((n (word-count vs))
         (prox (make-symat n))
         (sim-fn (vs-similarity-fn vs)))
    ;; Initialize the mapping between words and numerical ids, as used
    ;; for accessing the proximity matrix:
    (map-strings-to-ids vs)
    (loop
        for i below n
        for vec-i = (get-feature-vector vs (id2string i vs))
        do (loop
               for j from i below n
               for vec-j = (get-feature-vector vs (id2string j vs))
               do (setf (symat-ref prox i j)
                        (funcall sim-fn vec-i vec-j))))
    (setf (vs-proximity-matrix vs) prox)))

(defun get-proximity (vs word1 word2)
  "Retrieves the proximity (by look-up) for a given pair of words."
  (symat-ref (vs-proximity-matrix vs)
             (string2id (normalize-token word1) vs)
             (string2id (normalize-token word2) vs)))

;;; 1 B)

(defun find-knn (vs word &optional (k 5))
  "Return the k nearest neighbors for a given word in the v space."
  (let ((knn))
    (loop
        with prox = (vs-proximity-matrix vs)
        with i = (string2id (normalize-token word) vs)
        for j below (word-count vs)
        unless (= i j)
        collect (cons j (symat-ref prox i j)) into neighbors
        finally (setq knn (subseq (sort neighbors #'> :key #'cdr)
                                  0 k)))
    (mapcar #'(lambda (n)
                (id2string (first n) vs))
            knn)))

;; Example call sequence:
;;
;; (compute-proximities vspace)
;;
;; (find-knn vspace "egypt")
;; ==>
;; ("congo" "germany" "europe" "italy" "america")
;;
;; (find-knn vspace "butter" 10)
;; ==>
;; ("salt" "sauce" "eggs" "pepper" "mustard" "milk" "water" "toast" "meat")

;;; 2 A)

;; A note on how we store the class information: Each class label is
;; used as a key in a hash-table, where for each class we store a
;; property list with two keys, :members and :centroid.

(defun read-classes (vs file)
  "Read class file and store the information in the vs structure."
  (let ((classes (make-hash-table)))
    (with-open-file (stream file)
      (loop
          for list = (read stream nil nil)
          for class = (first list)
          for members = (mapcar #'normalize-token (second list))
          while list
          do (setf (gethash class classes)
                   (list :members members))
          finally (setf (vs-classes vs) classes)))))
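;; For reference, `read-classes' expects each entry in the class file
;; to be a Lisp-readable list of the form (label (word ... word)),
;; since it simply calls `read' on the stream. The actual contents of
;; `classes.txt' may of course differ; the two entries below are only
;; meant to illustrate the assumed format:
;;
;; (:foodstuff ("butter" "pepper" "salt"))
;; (:unknown ("egypt" "robert" "fruit"))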
;;; 2 B)

;; In addition to the function `compute-class-centroids' we define the
;; helper functions `retrieve-vectors', `vector-average',
;; `sum-vectors', `get-class-centroid' and `get-class-members'.

(defun sum-vectors (&rest vectors)
  (let ((sum (make-hash-table :test #'equal)))
    (dolist (vec vectors)
      (maphash #'(lambda (dim val)
                   (incf (gethash dim sum 0) val))
               vec))
    sum))

(defun vector-average (&rest vectors)
  "Computes a centroid for an arbitrary number of vectors."
  (let ((n (length vectors))
        (sum (apply #'sum-vectors vectors)))
    (maphash #'(lambda (dim val)
                 (setf (gethash dim sum) (/ val n)))
             sum)
    sum))

(defun retrieve-vectors (vs words)
  "Return a set of feature vectors for a given set of words."
  (mapcar #'(lambda (w) (get-feature-vector vs w))
          words))

(defun get-class-centroid (name vs)
  (getf (gethash name (vs-classes vs)) :centroid))

(defun get-class-members (name vs)
  (getf (gethash name (vs-classes vs)) :members))

(defun compute-class-centroids (vs)
  "Compute and store the average-vectors for each class."
  (loop
      with classes = (vs-classes vs)
      for label being the hash-keys of classes
      using (hash-value class)
      for words = (get-class-members label vs)
      for vectors = (retrieve-vectors vs words)
      for centroid = (length-normalize-vector
                      (apply #'vector-average vectors))
      ;; For each class label, store the centroid in the
      ;; property list we created in `read-classes':
      do (setf (gethash label classes)
               (append (list :centroid centroid) class))))

;;; 2 C)

(defun rocchio-classify (vs)
  "Classify all words labeled :unknown according to centroid distance."
  (loop
      with sim-fn = (vs-similarity-fn vs)
      with classes = (vs-classes vs)
      with unknown-words = (get-class-members :unknown vs)
      with unknown-vectors = (retrieve-vectors vs unknown-words)
      with named-centroids = (loop
                                 for label being the hash-keys of classes
                                 unless (eq label :unknown)
                                 collect (cons label (get-class-centroid label vs)))
      for word in unknown-words
      for vec in unknown-vectors
      ;; Find the class label with the closest centroid
      ;; for each unknown (unlabeled) word:
      collect (loop
                  with max-label
                  with max-sim = 0
                  for (label . center) in named-centroids
                  for sim = (funcall sim-fn vec center)
                  when (> sim max-sim)
                  do (setq max-label label
                           max-sim sim)
                  finally (return (list word max-label max-sim)))))

;; Example call sequence:
#||
(progn
  (read-classes vspace "classes.txt")
  (compute-class-centroids vspace)
  (rocchio-classify vspace))
||#
;; ==>
;;
;; (("fruit" :FOODSTUFF 0.3667868) ("california" :PERSON_NAME 0.29967904)
;;  ("peter" :PERSON_NAME 0.30886117) ("egypt" :PLACE_NAME 0.3074144)
;;  ("department" :INSTITUTION 0.5497112) ("hiroshima" :PLACE_NAME 0.2304558)
;;  ("robert" :PERSON_NAME 0.59566295) ("butter" :FOODSTUFF 0.3845482)
;;  ("pepper" :FOODSTUFF 0.36386278) ("asia" :PLACE_NAME 0.39868993)
;;  ("roosevelt" :TITLE 0.2597388) ("moscow" :PLACE_NAME 0.4841283)
;;  ("senator" :TITLE 0.35630104) ("university" :INSTITUTION 0.5805089)
;;  ("sheriff" :TITLE 0.22804837))

;;; 2 D)
;;;
;;; The main difference between the feature vectors and the centroid
;;; vectors is that the latter will be much more dense. Being the
;;; average of all the feature vectors in a class, a centroid will
;;; typically have a much higher proportion of active (non-nil)
;;; features. This also means that the hash-table representation might
;;; no longer be the optimal choice, and we might consider switching
;;; to arrays instead.
;;;
;;; Elaborating further:
;;;
;;; One solution here would be to use "dynamic" data structures,
;;; i.e. changing our data structure depending on properties of the
;;; feature vectors. For example, we could decide to use hash-tables
;;; up to a certain level of density, and then switch to an
;;; array representation for very dense vectors. For very sparse
;;; vectors, on the other hand, we could choose to use simple
;;; association lists.
;;;
;;; Now, if we wanted to use such dynamic and heterogeneous data
;;; types, our functions that operate on vectors would have to be able
;;; to do the right thing depending on the underlying type of the
;;; vector arguments. For example, we would need to implement support
;;; for computing the `dot-product' for a pair of vectors where one is
;;; a hash-table and the other is an array; see the sketch below.
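;; A minimal sketch of how such type dispatching could look, using
;; CLOS multiple dispatch. This is not part of the exercise code: it
;; assumes sparse vectors represented as hash-tables and dense vectors
;; as simple arrays, and (for the mixed cases) that the hash-table
;; keys are integer feature ids usable as array indices. The name
;; `dot-product*' is chosen to avoid clashing with the `dot-product'
;; from solution2a:

(defgeneric dot-product* (v1 v2)
  (:documentation "Dot product over heterogeneous vector representations."))

;; sparse x sparse: iterate over one table, look up in the other:
(defmethod dot-product* ((v1 hash-table) (v2 hash-table))
  (loop
      for dim being the hash-keys of v1
      using (hash-value val)
      sum (* val (gethash dim v2 0))))

;; sparse x dense: iterate over the active entries of the sparse vector:
(defmethod dot-product* ((v1 hash-table) (v2 vector))
  (loop
      for dim being the hash-keys of v1
      using (hash-value val)
      sum (* val (aref v2 dim))))

;; dense x sparse: just flip the arguments:
(defmethod dot-product* ((v1 vector) (v2 hash-table))
  (dot-product* v2 v1))

;; dense x dense: plain element-wise loop:
(defmethod dot-product* ((v1 vector) (v2 vector))
  (loop
      for x across v1
      for y across v2
      sum (* x y)))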
;; 3 A)
;;
;; Keyword summary of the main differences between Rocchio classification
;; and kNN classification:
;;
;; Rocchio:
;; ----------------
;;
;; +Decision boundary: Linear.
;;
;; +Class representation: Centroid-based.
;;
;; +The classification decision: Objects are assigned to the class with
;;  the nearest centroid vector.
;;
;; +Disadvantages: Ignores details of the distribution of points within a
;;  class and implicitly assumes that classes are equally sized
;;  spheres. Only suitable for linearly separable classes.
;;
;; +Advantages: Test time is independent of the number of training
;;  examples and linear in the number of classes. Conceptually simple.
;;
;; kNN:
;; ----------------
;;
;; +Decision boundary: Non-linear.
;;
;; +Class representation: Exemplar-based.
;;
;; +The classification decision: Objects are assigned to the class of the
;;  majority of the $k$ nearest neighbors (see the sketch below).
;;
;; +Disadvantages: The parameter $k$ must be specified in advance, either
;;  manually or by optimizing on held-out data. No real learning going on;
;;  the method simply memorizes all training examples (an instance of
;;  so-called memory-based learning). A large training set comes with an
;;  efficiency penalty during classification, as there are then more
;;  pairwise similarities to compute and compare (test time is linear in
;;  the size of the training set).
;;
;; +Advantages: The decision boundary is determined locally. Deals better
;;  with classes that cover disconnected or non-spherical regions or other
;;  irregular shapes. Applicable to non-linear classification
;;  problems. Test time is independent of the number of classes, a
;;  potential advantage for problems with many classes.
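;; For concreteness, a minimal sketch of what the kNN classification
;; decision could look like on top of `find-knn' above. This is not
;; part of the exercise: it assumes that each neighbor word has a
;; known gold label retrievable through some function `word-class'
;; (hypothetical here), and it breaks ties arbitrarily:

(defun knn-classify (vs word &optional (k 5))
  "Assign WORD to the majority class among its k nearest neighbors."
  (loop
      with votes = (make-hash-table)
      ;; count one vote per neighbor label:
      for neighbor in (find-knn vs word k)
      do (incf (gethash (word-class neighbor) votes 0))
      ;; return the label with the most votes:
      finally (return
                (loop
                    with best-label
                    with best-count = 0
                    for label being the hash-keys of votes
                    using (hash-value count)
                    when (> count best-count)
                    do (setq best-label label
                             best-count count)
                    finally (return best-label)))))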
;; 3 B)
;;
;; Rocchio classification vs k-means clustering:
;;
;; While both methods aim to model categories in the data, they're
;; instances of supervised and unsupervised learning,
;; respectively. Rocchio classification is an instance of supervised
;; learning using labeled training data: The class labels that we want to
;; learn to predict are pre-specified for each training example. In
;; contrast, k-means clustering is an instance of unsupervised learning,
;; where the goal is to automatically discover groups in unlabeled
;; data. Both methods rely on a centroid-based representation of the
;; categories that they model, and both determine group membership based
;; on centroid distance. In this respect, k-means can be viewed as an
;; unsupervised variant of Rocchio.

;; 3 C)
;;
;; In order to evaluate our Rocchio classifier above, we would at least
;; need to have gold standard class labels for the test instances (the
;; :unknown words), telling us what the correct class assignments
;; should be. This would allow us to determine whether the various
;; decisions of our classifier are wrong or correct, and more
;; specifically whether they represent true positives / false
;; positives / true negatives / false negatives. On the basis of this
;; we could then compute evaluation scores like precision, recall, and
;; the combined F-measure. Since we're dealing with a multi-class
;; problem, we would also need to compute aggregated scores based on
;; macro- or micro-averaging. The first strategy would mean computing
;; precision, recall, etc. for each class and then averaging the scores
;; across the classes, whereas the second strategy would mean first
;; counting all the TPs/FPs/TNs/FNs irrespective of the particular class
;; labels and then computing the global evaluation scores directly from
;; these. Since there are no large differences here with respect to
;; the sizes of the classes, we would expect the specific choice of
;; macro- vs micro-averaging to not matter that much, but we would
;; nonetheless need more test data than what we have used here in order
;; to get reliable and representative scores.
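;; To make the macro/micro distinction concrete, a small sketch of the
;; two averaging strategies for precision. Both the function and the
;; input format are hypothetical: we assume the predicted and gold
;; labels are given as two parallel lists of class labels:

(defun macro-vs-micro-precision (predicted gold)
  "Return macro- and micro-averaged precision for two label lists."
  (let ((tp (make-hash-table))   ;; true positives per class
        (fp (make-hash-table)))  ;; false positives per class
    (loop
        for p in predicted
        for g in gold
        do (if (eq p g)
               (incf (gethash p tp 0))
             (incf (gethash p fp 0))))
    (loop
        with labels = (remove-duplicates (append predicted gold))
        for label in labels
        for tps = (gethash label tp 0)
        for fps = (gethash label fp 0)
        ;; per-class precision (taken as 0 when a class is never predicted):
        sum (if (zerop (+ tps fps)) 0 (/ tps (+ tps fps))) into macro
        sum tps into all-tp
        sum fps into all-fp
        finally (return (values (/ macro (length labels))
                                (/ all-tp (+ all-tp all-fp)))))))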