;;; -*- Mode: LISP; Package: MORPH; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; ;; Copyright (C) Paul Meurer 2000 - 2007. All rights reserved. ;; paul.meurer@aksis.uib.no ;; Aksis, University of Bergen ;; ;; Reimplementation in ANSI Common Lisp of the compound analyser for Norwegian (C program) ;; written by Helge Hauglin and Lars Joergen Tvedt, UiO 1999 ;; Used in the CG parser/tagger system (Oslo-tagger) developed at UiO ;; (Dokumentasjonsprosjektet; Tekstlaboratoriet) ;; ;;------------------------------------------------------------------------------------- ;; TO DO: ;; - CLOSify (partially done) ;; - get rid of ranking-info structure, lazy eval instead ;; - rewrite the whole thing as a fsa! DONE. ;; - not working: words like "aksjespesifikk" "akjsonaervalgt" "aktuarmessig" ;; - "annualisere" "appresiere" ... ;;------------------------------------------------------------------------------------- (in-package "MORPH") (defclass compound-analyser-lexicon () ((morph-feature-vector :initarg :morph-feature-vector :reader morph-feature-vector) (morph-feature-table :initarg :morph-feature-table :reader morph-feature-table) (fullform-net :initarg :fullform-net :reader fullform-net) (aux-net :initform nil :initarg :aux-net :reader aux-net) (package :initarg :package :initform :morph :reader analyser-package) (compound-regexp :initarg :compound-regexp :reader compound-regexp) (hyphenated-compound-regexp :initform nil :initarg :hyphenated-compound-regexp :reader hyphenated-compound-regexp) (unknown-feature :initarg :unknown-feature :reader unknown-feature) (special-readings-fn :initform nil :initarg :special-readings-fn :reader special-readings-fn))) #+moved (defparameter *bm-morph-feature-vector* #("Noun" "Prop" "Adj" "Verb" "Aux" "Adv" "Prep" "Interj" "Conj" "Pron" "DetPron" "Ord" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Sent" "Num" "Card" "Pos" "Comp" "Sup" "PastPart" "PresPart" "Masc" "Fem" "Neut" "MF" "MFN" "Def" "Indef" "NoDef" "Def/Pl" "Sg" "Pl" "SP" "Nom" "Acc" "Gen" "Pres" "SForm" "Past" "Impv" "Infin")) #+moved (defparameter *bm-analyser-lexicon* (let ((mfv *bm-morph-feature-vector* #+ignore #("Noun" "Prop" "Adj" "Verb" "Aux" "Adv" "Prep" "Interj" "Conj" "Pron" "DetPron" "Ord" "Coord" "Punct" "Det" "Art" "Symbol" "Money" "PartInf" "Sent" "Num" "Card" "Pos" "Comp" "Sup" "Masc" "Fem" "Neut" "MF" "MFN" "Def" "Indef" "NoDef" "Def/Pl" "Sg" "Pl" "SP" "Nom" "Acc" "Gen" "Pres" "SForm" "Past" "Impv" "Infin" "PastPart" "PresPart"))) (make-instance 'compound-analyser-lexicon :morph-feature-vector mfv :morph-feature-table (let ((mft (dat::make-string-tree))) (loop for i from 0 for f across mfv do (setf (dat:string-tree-get mft (string-downcase f)) i))) :fullform-net (string-net::read-net "~/lisp/projects/xle/morph/bm-morph.net" :translate-p nil)))) (defvar *analyser-lexicon*) (defvar *%strings* (make-array 0 :adjustable t :fill-pointer t)) (defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t)) (defun %vector-pop (vector) (let* ((fill (fill-pointer vector))) (declare (fixnum fill)) (unless (zerop fill) (decf fill) (setf (fill-pointer vector) fill) (aref vector fill)))) (defmacro %with-string ((string) &body body) `(let ((,string (or (%vector-pop *%strings*) (make-array 0 :element-type #+mcl 'base-character #-mcl 'character :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,string) 0) (vector-push-extend ,string *%strings*)))) (defmacro %with-byte-array ((array) &body body) `(let ((,array (or (%vector-pop *%byte-arrays*) (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,array) 0) (vector-push-extend ,array *%byte-arrays*)))) (defun vowelp (c) (find c (list #\a #\e #\i #\o #\u #\y #\LATIN_SMALL_LETTER_AE #\LATIN_SMALL_LETTER_O_WITH_STROKE #\LATIN_SMALL_LETTER_A_WITH_RING_ABOVE))) (defun long-chunk-p (chunk) (let* ((first-vowel-group-pos (position-if #'vowelp chunk)) (middel-cons-group-pos (when first-vowel-group-pos (position-if-not #'vowelp chunk :start first-vowel-group-pos)))) (when middel-cons-group-pos (find-if #'vowelp chunk :start middel-cons-group-pos)))) (defun codes (chunk) (cddr chunk)) (defun chunk (chunk) (cadr chunk)) (defun chunk-and-codes (chunk) (cdr chunk)) ;; sjekk_ledd (defun check-chunk (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and (all-p #'adjective-p codes) (not (zerop (length analysis))) (all-p #'adverb-p (codes (aref analysis 0)))) nil) ((and last-p (or (all-p #'adverb-p codes) (all-p #'code-98x-p codes))) nil) ((and (not last-p) (not (find-if #'previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) (defmethod %has-feature-p ((lexicon compound-analyser-lexicon) feature-vector feature) (when (bit-vector-p feature-vector) (with-slots (morph-feature-table) lexicon (let ((code (dat:string-tree-get morph-feature-table (string-downcase feature)))) (when code (= 1 (sbit feature-vector (the fixnum code)))))))) (defmethod %has-features-p ((lexicon compound-analyser-lexicon) feature-vector features) (loop for feature in features always (%has-feature-p lexicon feature-vector feature))) (defmethod %all-have-features-p ((lexicon compound-analyser-lexicon) codes &rest features) (not (find-if-not (lambda (code) (%has-features-p lexicon code features)) codes))) (defmethod %one-has-features-p ((lexicon compound-analyser-lexicon) codes &rest features) (find-if (lambda (code) (%has-features-p lexicon code features)) codes)) #+obsolete (defun check-chunk-features (analysis chunk codes last-p) (cond ;; suppress chunks of length 1, except junctions ((and (= (length chunk) 1) (not (find (char chunk 0) "es"))) nil) ((and ;;(all-p #'adjective-p codes) (all-have-features-p codes 'adj) (not (zerop (length analysis))) ;;(all-p #'adverb-p (codes (aref analysis 0))) (all-have-features-p (codes (aref analysis 0)) 'adv)) nil) ((and last-p (or (all-have-features-p codes 'adv) ;; (all-p #'adverb-p codes) #+later(all-p #'code-98x-p codes) )) nil) ((and (not last-p) (not (find-if #'f-previous-chunk-p codes))) nil) #+to-be-fixed ((all-p #'prefix-p codes) (incf affix-count)) (t t))) (defun e-juncture-p (codes) (when (find-if (lambda (code) (and (listp code) (find :e-juncture code))) codes) t)) (defun s-juncture-p (codes) (when (find-if (lambda (code) (and (listp code) (find :s-juncture code))) codes) t)) (defun juncture-p (codes) (or (e-juncture-p codes) (s-juncture-p codes))) #+obsolete (defun f-previous-chunk-p (code) (or (has-feature-p code 'subst) ;; ?? (real-noun-p code) (has-feature-p code 'verb) ;; (verb-p code) (has-feature-p code 'adj) ;; (adjective-p code) (has-feature-p code 'adv) ;; (adverb-p code) (has-feature-p code ') ;; (participle-p code) (has-feature-p code ') #+later(code-1000-plus-og-p code) ;?? (has-feature-p code 'kvant) ;; (number-p code) ;; *** ADDED! (has-feature-p code ') #+later(affix-p code))) (defun filter-analysis (analysis) (declare (ignore analysis)) t) (defun all-p (predicate codes) (not (find-if-not predicate codes))) (defclass compound-analyser () ((current-analysis :initform (make-array 0 :fill-pointer t :adjustable t) :accessor current-analysis) (ranking :initform (make-array 0 :fill-pointer t :adjustable t) :accessor ranking) (compound-chunks :initform (make-array 0 :fill-pointer t :adjustable t) :accessor compound-chunks) (analysis-tree :initform nil :accessor analysis-tree))) (defparameter *compound-analyser-pool* ()) (defun get-compound-analyser () (let ((analyser (or (pop *compound-analyser-pool*) (make-instance 'compound-analyser)))) (with-slots (analysis-tree current-analysis ranking compound-chunks) analyser (setf (fill-pointer current-analysis) 0 (fill-pointer ranking) 0 (fill-pointer compound-chunks) 0 analysis-tree nil)) analyser)) (defmacro with-compound-analyser ((analyser) &body body) `(let ((,analyser (get-compound-analyser))) ; thread safe! (multiple-value-prog1 (progn ,@body) (push ,analyser *compound-analyser-pool*)))) (defun analyse-compound (string) (with-compound-analyser (analyser) (analysis-tree (%analyse-compound analyser string)))) ;; TO DO: analyse from last space if string is multi-word. (defmethod %analyse-compound ((analyser compound-analyser) string &key (try-downcase-p t) (hyphened-prefix-unknown-p t) (restore-char-p t) end-hyphen-p listed-compounds) (with-slots (analysis-tree current-analysis compound-chunks) analyser (dotimes (i (1+ (length string))) (vector-push-extend nil compound-chunks)) ; does not cons unnecessarily (let ((hyphen-pos (when hyphened-prefix-unknown-p (position #\- string)))) #+debug(print (list :hyphen-pos hyphen-pos :hyphened-prefix-unknown-p hyphened-prefix-unknown-p)) (setf analysis-tree (or (%sub-analyse-compound-regexp analyser string (if hyphen-pos (1+ hyphen-pos) 0) :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p :listed-compounds listed-compounds) (and try-downcase-p (upper-case-p (char string 0)) (%sub-analyse-compound-regexp analyser (string-downcase string) (if hyphen-pos (1+ hyphen-pos) 0) :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p :listed-compounds listed-compounds)) ;; if there is no valid analysis with known chunks collect all analyses ;; with unknown prefix containing at least one vowel (let ((vowel-pos (position-if #'vowelp string))) (when vowel-pos (collecting (loop for start from (1+ vowel-pos) to (- (length string) 2) with count = -1 and key = 0 and analysis do (multiple-value-bind (analysis cnt) (%sub-analyse-compound-regexp analyser string start :unknown-first-p t :count count :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p :listed-compounds listed-compounds) (when analysis (collect (car analysis)) (if cnt (setf count cnt) (incf count (caaar analysis))))))))))))) analyser) (defvar *net* nil) ;; cp. morph-server-sockets.lisp (defmethod byte-array-to-bit-vector ((lexicon compound-analyser-lexicon) array) (declare (optimize (speed 3) (safety 0))) (let ((feature-bv (make-array (length (morph-feature-vector lexicon)) :element-type 'bit :initial-element 0))) (loop for byte across array for i fixnum from 0 do (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit feature-bv (the fixnum (+ (the fixnum (* i 8)) j))) 1)))) feature-bv)) (defmethod byte-array-to-bit-vector ((lexicon t) array) (array-to-bit-vector array)) (defmacro do-chunk-readings ((lexicon node lemma feature-bv) &body body) (with-gensyms (%string %array code) `(%with-string (,%string) (%with-byte-array (,%array) (string-net::nmap-string+array (fullform-net ,lexicon) (lambda (,lemma ,code) (declare (ignorable ,lemma ,code)) (let ((,feature-bv (byte-array-to-bit-vector ,lexicon ,code))) ,@body)) ,%string ,%array ,node #\:))))) ;;;; New regexp pruning of chunking analyses (fsa::def-string-test-function pre-e-juncture (chunk) (let* ((vowel-pos (position-if (lambda (c) (vowelp c)) chunk)) (cons-pos (when vowel-pos (position-if-not (lambda (c) (vowelp c)) chunk :start vowel-pos))) (next-vowel-pos (when cons-pos (position-if (lambda (c) (vowelp c)) chunk :start cons-pos)))) (and cons-pos (not next-vowel-pos)))) (fsa::def-string-test-function pre-s-juncture (chunk) ;; chunk is not first chunk ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (or (= length 1) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2)))))) (fsa::def-string-test-function immediate-pre-s-juncture (chunk) ;; the last consonant group does not contain #\s ;; does not end in "s", "sj" or "ch" (let ((length (length chunk))) (and (string/= chunk "s" :start1 (- length 1)) (or (= length 1) (string/= chunk "sj" :start1 (- length 2)) (string/= chunk "ch" :start1 (- length 2)) (let ((last-s-pos (position #\s chunk :from-end t))) (or (not last-s-pos) (position-if (lambda (c) (vowelp c)) chunk :start last-s-pos))))))) #+test (print pre-e-juncture) #+old ;; cgp (defparameter *compound-regexp* (make-instance 'feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or ukjent (:and subst ub ent) (:and verb inf) (:and det kvant) (:seq (:and subst ub ent ,immediate-pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture)) ;; inner chunks (:* (:or (:and subst ub ent) (:and verb inf) (:seq (:and subst ub ent ,pre-s-juncture) :s-juncture) (:seq (:and subst ub ent ,pre-e-juncture) :e-juncture))) ;; last chunk subst) ;; number (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" adv) (:and det kvant))))))) #+test (print-ranked-analyses-regexp "sommerhatt") #+old (defparameter *compound-regexp* (make-instance 'fsa::feature-regexp :name "Regexp for compound analyser" :source-regexp `(:or (:seq ;; first chunk (:or Unknown (:and Noun Indef Sg (:not Gen)) (:and Verb Infin (:not SForm)) #+not-yet(:and det kvant) (:seq (:and Noun Indef Sg (:not Gen) ,immediate-pre-s-juncture) :s-juncture) (:seq (:and Noun Indef Sg (:not Gen) ,pre-e-juncture) :e-juncture)) ;; inner chunks (:* (:or (:and Noun Indef Sg);;(:and subst ub ent) (:and Verb Infin (:not SForm)) (:seq (:and Noun Indef Sg (:not Gen) ,pre-s-juncture) :s-juncture) (:seq (:and Noun Indef Sg (:not Gen) ,pre-e-juncture) :e-juncture))) ;; last chunk Noun) ;; number ;; Problem: det kvant has no counterpart in norgram #+not-yet (:seq (:+ (:and det kvant)) (:? (:seq (:and "og" adv) (:and det kvant))))))) (defun cp-boolean-list-delta-get (features-list state delta) (let ((relation (fsa::relation-get state delta)) (result ())) (when relation (let* ((features features-list) (states (fsa::relation-subsumed-get features relation))) #+debug (print (list :features features :relation relation :states states)) ;; STATE ist a list of state id + new features list (cp. fsa::KEY-SUBSUME) (dolist (state states) (pushnew (append state (list features)) result :test #'equal))) result))) #+test (setf *tagger* *nbo-tagger*) #+test (print-ranked-analyses-regexp "kildebergart") #+test (let ((*analyser-version* :regexp)) (print-ranked-analyses-regexp "kildebergart")) #+ignore (setf *analyser-lexicon* *bm-analyser-lexicon*) #+moved (defparameter *aux-lex-net* (let ((string-net (make-instance 'string-net::list-string-net)) (*print-circle* t)) #+test (string-net::add-string string-net (concat "1::" (cgp::bit-vector-to-string (bv-encode-features *analyser-lexicon* "Digit")))) (setf (list-net string-net) (cadr '(x #1=((#\1 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\2 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\3 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\4 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\5 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\6 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\7 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\8 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\9 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#) (#\0 (#\: (#\: (#\null (#\null (#\null (#\null (#\null (#\%null NIL)))))))) . #1#))))) (print (list-net string-net)))) #+test (print (cgp::bit-vector-to-string (bv-encode-features *analyser-lexicon* "Digit"))) (defmethod %sub-analyse-compound-regexp ((analyser compound-analyser) string start &key (restore-char-p t) (count -1) end-hyphen-p listed-compounds &allow-other-keys) #+debug(print (list :string string :start start)) (with-slots (current-analysis ranking compound-chunks) analyser (let* ((word-end-marker #\:) (length (length string)) (memory (make-array (1+ length) :initial-element :not-memoized)) (lexicon (fullform-net *analyser-lexicon*)) (lex-net (list-net lexicon)) (aux-lex-net (aux-net *analyser-lexicon*)) (two-letter-segments (list "en" "to" "ni" "år" "gå" "øl" "is" "by" "bo"))) (when listed-compounds ;; not very efficient! (dat:do-string-tree (string segments listed-compounds) (dolist (pair segments) (when (= (length (car pair)) 2) (pushnew (car pair) two-letter-segments :test #'string=))))) (labels ((analyse (start) (let ((memoized-analysis (aref memory start))) (if (eq memoized-analysis :not-memoized) (setf (aref memory start) (collecting (labels ((walk (pos lex-n aux-lex-n &optional jump-char) #+debug(print (list :walk pos string)) (let* ((char (and (< pos length) (or jump-char (if restore-char-p (string-net::restore-char (char string pos)) (char string pos))))) (lex-sub-node (when (< pos length) (find char lex-n :key #'car))) (lex-end-marker-node (find word-end-marker lex-n :key #'car)) (aux-lex-sub-node (when (< pos length) (find char aux-lex-n :key #'car))) (aux-lex-end-marker-node (find word-end-marker aux-lex-n :key #'car))) (when (or lex-sub-node aux-lex-sub-node) (walk (1+ pos) (cdr lex-sub-node) (cdr aux-lex-sub-node))) (when (and (not jump-char) (or lex-end-marker-node aux-lex-end-marker-node)) ;; then, extract codes using the remaining sub-network (let ((lex-node (find word-end-marker lex-n :key #'car)) (aux-lex-node (find word-end-marker aux-lex-n :key #'car)) (features ()) (chunk (subseq string start pos))) #+debug(print (list :start start :pos pos :chunk chunk)) (when (or (> (- pos start) 2) (and (< pos length) (char= (char string pos) #\-) ;; allow "-" in pos 0 and 1 (> (- pos start) (if (= start 0) 0 2))) (and (= (- pos start) 2) (find chunk two-letter-segments :test #'string-equal))) (do-chunk-readings (*analyser-lexicon* lex-node lemma bv) (push (cons (decompress-string lemma chunk) bv) features)) (do-chunk-readings (*analyser-lexicon* aux-lex-node lemma bv) (push (cons (decompress-string lemma chunk) bv) features)) (when features ;; no juncture (let ((chunk+features (cons chunk features))) (vector-push-extend (cons nil chunk+features) current-analysis) (let ((analysis (analyse pos))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (* (if analysis (reduce #'+ analysis :key #'caar) 1) (length features)) chunk+features) analysis)))) ;; triple letters (when (and (< pos (1- length)) (find (char string (1- pos)) "bdfgklnprst") (eq (char string (- pos 1)) (char string (- pos 2)))) (let ((analysis (analyse (1- pos)))) (when (or analysis (= pos length)) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (collect (cons (cons ;; label the node by the number of complete paths starting from it (* (if analysis (reduce #'+ analysis :key #'caar) 1) (length features)) chunk+features) analysis))))) (decf (fill-pointer current-analysis))) ;; juncture or hyphen (unless (= pos length) (let ((char (char string pos))) (when (find char "es-") (let ((chunk+features (cons chunk features)) (juncture (ecase char (#\e (list "e" (list "e" :e-juncture))) (#\s (list "s" (list "s" :s-juncture))) (#\- (list "-" (list "-" :hyphen)))))) #+debug(print juncture) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) #+debug(print current-analysis) ;; hyphen after juncture (let ((analysis (analyse (1+ pos)))) #+debug(print (list :char char :an analysis)) ;; we allow juncture at the end before hyphen (which is not part of the compound token) (when (or analysis (and (= (1+ pos) length) (or end-hyphen-p (eq char #\-)))) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let* ((sub-branch-count (if analysis (reduce #'+ analysis :key #'caar) 1)) (branch-count (* (length features) sub-branch-count))) (collect (list (cons branch-count chunk+features) (cons (cons sub-branch-count juncture) analysis))))) (decf (fill-pointer current-analysis) 2)))))) ;; juncture + hyphen (when (and (< (1+ pos) length) (char= (char string (1+ pos)) #\-)) (let ((char (char string pos))) (when (find char "es") (let ((chunk+features (cons chunk features)) (juncture (ecase char (#\e (list "e" (list "e" :e-juncture))) (#\s (list "s" (list "s" :s-juncture))))) (hyphen (list "-" (list "-" :hyphen)))) (vector-push-extend (cons nil chunk+features) current-analysis) (vector-push-extend (cons nil juncture) current-analysis) (vector-push-extend (cons nil hyphen) current-analysis) (let ((analysis (analyse (+ pos 2)))) #+debug(print (list :char char :an analysis)) (when (or analysis (and (= (+ pos 2) length))) (unless (zerop start) (setf (aref compound-chunks pos) chunk)) (let* ((sub-branch-count (if analysis (reduce #'+ analysis :key #'caar) 1)) (branch-count (* (length features) sub-branch-count))) (collect (list (cons branch-count chunk+features) (list (cons sub-branch-count juncture) (cons (cons sub-branch-count hyphen) analysis)))))) (decf (fill-pointer current-analysis) 3))))))))))))) (walk start lex-net aux-lex-net) ;; this deals with suffixes (which are stored with hyphen in the lexicon) (walk (1- start) lex-net aux-lex-net #\-)))) (values memoized-analysis t))))) (let ((analyses (cond ((zerop start) (analyse start)) (t (vector-push-extend (list nil (subseq string 0 start)) current-analysis) (let* ((sub-analysis (analyse start)) (branch-count (count-analyses sub-analysis))) (prog1 (when sub-analysis (list (cons (list branch-count (subseq string 0 start) (cons (subseq string 0 start) (bv-encode-features *analyser-lexicon* (unknown-feature *analyser-lexicon*)))) sub-analysis))) (decf (fill-pointer current-analysis)))))))) (when analyses #+debug(print (list :analyses analyses)) (let ((pruned-analyses (prune-analyses (if end-hyphen-p (hyphenated-compound-regexp *analyser-lexicon*) (compound-regexp *analyser-lexicon*)) analyses :key (1+ count)))) #+debug(print (list :unknown (subseq string 0 start) :count (1+ count) :pa pruned-analyses)) (labels ((walk-memoized (analysis) ; change name! (dolist (sub-analysis analysis) (destructuring-bind (nkey chunk . readings) (car sub-analysis) (dolist (reading readings) ;;(print (list :pushing nkey chunk (if (consp reading) (cdr reading) (list reading)))) (vector-push-extend (list nkey chunk (if (consp reading) (cdr reading) (list reading))) current-analysis) (if (cdr sub-analysis) (walk-memoized (cdr sub-analysis)) ;; COUNT is a hash key (progn (when (find (incf count) pruned-analyses) #+debug(print :valid) (vector-push-extend (compute-f-ranking-info *analyser-lexicon* count analyser) ranking)) #+debug (print current-analysis))) (decf (fill-pointer current-analysis))))))) (walk-memoized analyses) #+debug(print ranking) #+debug(print (list :pruned-analyses pruned-analyses :anlyses analyses)) (values (when pruned-analyses analyses) count))))))))) (defun print-ranked-analyses-regexp (word &optional (hyphened-prefix-unknown-p t) (restore-char-p t) end-hyphen-p) (with-compound-analyser (analyser) (%analyse-compound analyser word :hyphened-prefix-unknown-p hyphened-prefix-unknown-p :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-codes-regexp (ranking-info-position (aref ranking i)) analysis-tree t))))))) #+orig (defun compound-optimal-analyses (word &key (max 1) (restore-char-p t) min-length-only-p) (with-compound-analyser (analyser) (%analyse-compound analyser word :hyphened-prefix-unknown-p nil :restore-char-p restore-char-p) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) #+debug(print (list :analysis-tree analysis-tree :ranking ranking)) (when analysis-tree (loop for i from 0 to (1- (min (length ranking) max)) for r across ranking with min = (ranking-info-effective-length (aref ranking 0)) when (or (not min-length-only-p) (= min (ranking-info-effective-length r))) collect (nth-analysis-codes-regexp (ranking-info-position r) analysis-tree t)))))) (defun compound-optimal-analyses (word &key (max 1) (restore-char-p t) min-length-only-p listed-compounds end-hyphen-p try-downcase-p) (labels ((compound-baseform (analysis &optional end) (let ((baseform "")) (loop for (seg . rest) on analysis until (and end (zerop end)) do (when end (decf end)) (let ((rseg (if rest (car seg) (cadr seg)))) (setf baseform ;; possibly, we have to restore the haplology (if (and (> (length baseform) 2) (char= (char baseform (- (length baseform) 1)) (char baseform (- (length baseform) 2)) (char rseg 0))) (concat baseform (subseq rseg 1)) (concat baseform rseg))))) baseform)) (match-p (analysis listed-analysis &optional end) (cond ((or (null analysis) (and end (zerop end))) (null listed-analysis)) ((null listed-analysis) nil) ((find (caddr (car analysis)) '(:e-juncture :s-juncture :hyphen)) (match-p (cdr analysis) listed-analysis (and end (1- end)))) ((and (string= (cadar analysis) (caar listed-analysis)) (subsetp (cdar listed-analysis) (cddar analysis) :test #'equal)) (match-p (cdr analysis) (cdr listed-analysis) (and end (1- end)))) (t nil)))) (with-compound-analyser (analyser) (%analyse-compound analyser word :try-downcase-p try-downcase-p :hyphened-prefix-unknown-p nil :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p :listed-compounds listed-compounds) (with-slots (analysis-tree ranking) analyser #+debug(print (list analysis-tree ranking)) (setf ranking (rank-analyses analyser)) #+debug(print (list :analysis-tree analysis-tree :ranking ranking)) (when analysis-tree (let (#+ignore(partial-listed-analyses ())) (collecting-into (analyses listed-analyses partial-listed-analyses) (loop for i from 0 to (1- (length ranking)) for r across ranking with count = 0 and l-count = 0 and p-l-count = 0 with min = (ranking-info-effective-length (aref ranking 0)) do (let ((analysis (nth-analysis-codes-regexp (ranking-info-position r) analysis-tree t))) #+debug(print (list :analysis analysis :listedp (not (null listed-compounds)))) (cond ((null listed-compounds) (when (and (<= (incf count) max) (or (not min-length-only-p) (= min (ranking-info-effective-length r)))) (collect-into analyses analysis))) (t (block check (let* ((baseform (compound-baseform analysis)) (listed-analysis (dat:string-tree-get listed-compounds baseform))) (when (match-p analysis listed-analysis) #+debug(print (list l-count max :match analysis listed-analysis)) (collect-into listed-analyses analysis) (return-from check)) (labels ((check-listed (tail-analysis end front-analysis foundp) #+debug(print (list :tail-analysis tail-analysis :end end :front-analysis front-analysis :foundp foundp)) (cond ((> end (length tail-analysis)) (when tail-analysis (check-listed (cdr tail-analysis) 0 (append front-analysis (list (car tail-analysis))) foundp))) ((= end (length tail-analysis) 0) (when foundp (collect-into partial-listed-analyses analysis) (return-from check))) (t (let* ((baseform (compound-baseform tail-analysis end)) (listed-analysis (dat:string-tree-get listed-compounds baseform))) #+debug(print baseform) ;;(print (list :tail-analysis tail-analysis :listed-analysis listed-analysis :end end)) (when (and listed-analysis (match-p tail-analysis listed-analysis end)) (check-listed (nthcdr end tail-analysis) 0 (append front-analysis listed-analysis) t))) (check-listed tail-analysis (1+ end) front-analysis foundp))))) (check-listed analysis 0 () nil) #+debug(print (list :an analysis :bf baseform :li listed-analysis)) (when (and (< i max) (or (not min-length-only-p) (= min (ranking-info-effective-length r)))) (collect-into analyses analysis))))))))) #+debug(print (list :partial-listed-analyses partial-listed-analyses :listed-analyses listed-analyses :analyses analyses)) (cond (listed-analyses (values listed-analyses t)) (partial-listed-analyses (values partial-listed-analyses :partial)) (t analyses))))))))) #+orig (defun compound-optimal-analyses (word &key (max 1) (restore-char-p t) min-length-only-p listed-compounds end-hyphen-p try-downcase-p) (labels ((compound-baseform (analysis) (let ((baseform "")) (loop for (seg . rest) on analysis do (setf baseform (concat baseform (if rest (car seg) (cadr seg))))) baseform)) (match-p (analysis listed-analysis) (cond ((null analysis) (null listed-analysis)) ((null listed-analysis) nil) ((find (caddr (car analysis)) '(:e-juncture :s-juncture :hyphen)) (match-p (cdr analysis) listed-analysis)) ((and (string= (cadar analysis) (caar listed-analysis)) (subsetp (cdar listed-analysis) (cddar analysis) :test #'equal)) (match-p (cdr analysis) (cdr listed-analysis))) (t nil)))) (with-compound-analyser (analyser) (%analyse-compound analyser word :try-downcase-p try-downcase-p :hyphened-prefix-unknown-p nil :restore-char-p restore-char-p :end-hyphen-p end-hyphen-p) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) #+debug(print (list :ranking ranking)) (when analysis-tree (collecting-into (analyses listed-analyses) (loop for i from 0 to (1- (length ranking)) for r across ranking with min = (ranking-info-effective-length (aref ranking 0)) do (let* ((analysis (nth-analysis-codes-regexp (ranking-info-position r) analysis-tree t)) (baseform (when listed-compounds (compound-baseform analysis))) (listed-analysis (when listed-compounds (dat:string-tree-get listed-compounds baseform)))) #+debug(print (list :an analysis :bf baseform :li listed-analysis)) (when (and (< i max) (or (not min-length-only-p) (= min (ranking-info-effective-length r)))) (collect-into analyses analysis)) (when (match-p analysis listed-analysis) #+debug(print (list :match analysis listed-analysis)) (collect-into listed-analyses analysis)))) (if listed-analyses (values listed-analyses t) analyses))))))) #+test (print (compound-optimal-analyses "bjeffer" :max 3 :restore-char-p nil :min-length-only-p t)) #+test (print-ranked-analyses-regexp "kildebergart") #+test (compound-optimal-analyses "1830-tallet" :max 10) (defmethod bv-features ((lexicon compound-analyser-lexicon) bv &key symbolp) (with-slots (morph-feature-vector package) lexicon (collecting (loop for pos from 0 for bit across bv do (when (= bit 1) (collect (if symbolp (intern (string-upcase (svref morph-feature-vector pos)) (or package :morph)) (svref morph-feature-vector pos)))))))) ;; still missing: unknown-features (defmethod bv-encode-features ((lexicon compound-analyser-lexicon) &rest features) (with-slots (morph-feature-vector) lexicon (let ((feature-bv (make-array (length morph-feature-vector) :element-type 'bit :initial-element 0)) (unknown-features ())) (loop for f across morph-feature-vector for code from 0 when (find f features :test #'string-equal) do (setf (sbit feature-bv code) 1)) feature-bv))) (defmethod prune-analyses ((fr fsa::feature-regexp) analyses &key (key 0) &allow-other-keys) #+debug(print (list :analyses analyses)) (collecting (let* ((dfa (fsa::regexp-dfa fr)) (delta (fsa:fsa-delta dfa)) (start-state (fsa:fsa-start-state dfa))) (labels ((walk (state analyses readings key chunking) #+debug(print (list state analyses readings key chunking)) #+debug(print chunking) ;; CHUNKING is for debugging only (if readings (let* ((reading (car readings)) (analysis (car analyses)) (chunk (cadar analysis)) (sub-count (/ (caar analysis) (length (cddar analysis)))) (features (if (consp (cdr reading)) reading (cons (car reading) (bv-features *analyser-lexicon* (cdr reading) :symbolp t)))) (new-states (cp-boolean-list-delta-get features state delta))) #+debug(print (list :features features :reading reading :state state :delta delta :new-states new-states :finite (fsa:fsa-final-states dfa))) (dolist (new-state+morph new-states) (destructuring-bind (new-state new-features . morph) new-state+morph (declare (ignore morph)) (cond ((cdr analysis) (destructuring-bind (nkey ch . readings) (caadr analysis) (declare (ignore ch)) (walk new-state (cdr analysis) readings key (cons (cons sub-count chunk) chunking)))) ((fsa:set-member-p new-state (fsa:fsa-final-states dfa)) #+debug(print (list :collect key (reverse (cons (cons sub-count chunk) chunking)))) (collect key)) (t nil)))) (cond ((cdr readings) (walk state analyses (cdr readings) (+ sub-count key) chunking)) ((cdr analyses) (walk state (cdr analyses) nil (+ sub-count key) chunking)) (t nil))) (destructuring-bind (nkey chunk . readings) (caar analyses) (walk state analyses readings key chunking))))) (walk start-state analyses nil key ()))))) (defun nth-analysis-regexp (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (cons chunk (nth-analysis-regexp n (cdar analysis))))) (nth-analysis-regexp (- n branch-count) (cdr analysis)))))) (defun nth-analysis-codes-regexp (n analysis &optional features-as-list-p) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" ;;(print (list :n n analysis)) (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (let ((reading (nth f-number f-codes))) (cons (cons chunk (if features-as-list-p (if (consp (cdr reading)) reading (cons (car reading) (bv-features *analyser-lexicon* (cdr reading)))) reading)) (nth-analysis-codes-regexp n (cdar analysis) features-as-list-p))))) (nth-analysis-codes-regexp (- n branch-count) (cdr analysis) features-as-list-p))))) #+test (print (nth-analysis-codes-regexp 5 (analyse-compound "barnevakt") t)) ;; LEKS_S_VERB_FINAL (defmethod final-lexical-s-verb-p ((lexicon compound-analyser-lexicon) analysis) "checks if last chunk is a verb starting with #\s, without preceding juncture" (let* ((length (length analysis)) (last-chunk (aref analysis (1- length))) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and (char= (char (chunk last-chunk) 0) #\s) (%all-have-features-p lexicon (codes last-chunk) 'verb) (not (and next-to-last-chunk (juncture-p (codes next-to-last-chunk))))))) ;; LEKS_VERB_INITIAL (defmethod initial-lexical-verb-p ((lexicon compound-analyser-lexicon) analysis) (and (%all-have-features-p lexicon (codes (aref analysis 0)) 'verb) (or (= (length analysis) 1) (not (juncture-p (codes (aref analysis 1))))))) (defun final-s-juncture-p (analysis) (let* ((length (length analysis)) (next-to-last-chunk (when (> length 1) (aref analysis (- length 2))))) (and next-to-last-chunk (s-juncture-p (codes next-to-last-chunk))))) (defun initial-e-juncture-p (analysis) ;;(print :e-j) (and (> (length analysis) 2) (e-juncture-p (codes (aref analysis 1))))) (defun initial-s-juncture-p (analysis) (and (> (length analysis) 2) (s-juncture-p (codes (aref analysis 1))))) ;; *** get rid of it (defstruct ranking-info position effective-length juncture-count unknown-first-chunk-p final-lexical-s-verb-p initial-lexical-verb-p final-s-juncture-p initial-e-juncture-p initial-s-juncture-p possibly-noun-p last-chunk-length first-chunk-compound-p long-first-chunk-p) (defmethod compute-f-ranking-info ((lexicon compound-analyser-lexicon) position analyser) (with-slots (current-analysis compound-chunks) analyser (let ((effective-length (effective-length current-analysis))) (make-ranking-info ;; position in the analysis tree :position position ;; length not counting junctures :effective-length effective-length ;; number of junctures :juncture-count (- (length current-analysis) effective-length) ;; unknown first chunk? :unknown-first-chunk-p (null (codes (aref current-analysis 0))) ;; final lexical s-verb? :final-lexical-s-verb-p (final-lexical-s-verb-p lexicon current-analysis) ;; initial lexical verb? :initial-lexical-verb-p (initial-lexical-verb-p lexicon current-analysis) ;; final s-juncture? :final-s-juncture-p (final-s-juncture-p current-analysis) ;; initial e-juncture? :initial-e-juncture-p (initial-e-juncture-p current-analysis) ;; initial e-juncture? :initial-s-juncture-p (initial-s-juncture-p current-analysis) ;; last chunk might be noun? :possibly-noun-p ;;(find-if #'noun-p (codes (aref current-analysis (1- (length current-analysis))))) #+old (find-if (lambda (code) (%has-feature-p lexicon code 'Noun)) (codes (aref current-analysis (1- (length current-analysis))))) ;; weighted noun-count (let ((wc 0)) (loop for seg across current-analysis for w from 1 when (find-if (lambda (code) (%has-feature-p lexicon code 'Noun)) (codes seg)) do (incf wc w)) wc) ;; length of last chunk :last-chunk-length (length (chunk (aref current-analysis (1- (length current-analysis))))) ;; first chunk has elsewhere been analysed as compound? :first-chunk-compound-p (aref compound-chunks (length (chunk (aref current-analysis 0)))) ;;:long-first-chunk-p (long-chunk-p (chunk (aref current-analysis 0))) )))) (defun regexp-compare-juncture-s-word (analyser n1 n2) #+debug(print (list :start :n1 n1 :n2 n2)) (with-slots (analysis-tree compound-chunks) analyser (labels ((next (a n) (when a (let ((branch-count (caaar a))) (if (< n branch-count) (multiple-value-bind (f-number n) (floor n (if (cdar a) (reduce #'+ (cdar a) :key #'caar) 1)) (values (car a) f-number n)) (next (cdr a) (- n branch-count)))))) (compare (n1 n2 f-n1 f-n2 pos1 pos2 a1 a2 sj1 sj2) #+debug(print (list :n1 n1 :n2 n2 :f-n1 f-n1 :f-n2 f-n2 :pos1 pos1 :pos2 pos2 :a1 a1 :a2 a2)) #+debug (when (or (null a1) (null a2)) (error "null")) (cond ((null a1) ;; ??? :incomparable #+ignore nil) ((null a2) :incomparable #+ignore t) (t (let* ((features1 (nth f-n1 (cddar a1))) (features2 (nth f-n2 (cddar a2))) (chunk1 (cadar a1)) (chunk2 (cadar a2)) (codes1 (cdr features1)) (codes2 (cdr features2))) #+debug(print (list :chunk1 chunk1 :chunk2 chunk2 :f1 features1 :f2 features2)) (cond ((and (= pos1 pos2) (s-juncture-p codes1) (char= (char chunk2 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a1) n1) ;; advance a1 (compare n n2 f-n f-n2 (1+ pos1) pos2 a a2 t nil))) ((and (= pos1 pos2) (s-juncture-p codes2) (char= (char chunk1 0) #\s)) (multiple-value-bind (a f-n n) (next (cdr a2) n2) ;; advance a2 (compare n1 n f-n1 f-n pos1 (1+ pos2) a1 a nil t))) ((and sj1 ;; condition we wanted to find (string= chunk1 chunk2 :start2 1) (compound-p compound-chunks chunk1 pos1)) t) ((and sj2 ;; condition we wanted to find (string= chunk1 chunk2 :start1 1) (compound-p compound-chunks chunk2 pos2)) nil) ((= pos1 pos2) ;; that's the only case in which we may have reached the end of both strings (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) (if (and a1 a2) (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) (+ pos2 (length chunk2)) a1 a2 nil nil) :incomparable)))) ((< pos1 pos2) (multiple-value-bind (a1 f-n1 n1) (next (cdr a1) n1) ; advance a1 (compare n1 n2 f-n1 f-n2 (+ pos1 (length chunk1)) pos2 a1 a2 nil nil))) ((> pos1 pos2) (multiple-value-bind (a2 f-n2 n2) (next (cdr a2) n2) ; advance a2 (compare n1 n2 f-n1 f-n2 pos1 (+ pos2 (length chunk2)) a1 a2 nil nil))) (t (error "a condition left: ~a" (list n1 n2 pos1 pos2 a1 a2 sj1 sj2))))))))) (multiple-value-bind (a1 f-n1 n1) (next analysis-tree n1) ; advance both (multiple-value-bind (a2 f-n2 n2) (next analysis-tree n2) (compare n1 n2 f-n1 f-n2 0 0 a1 a2 nil nil)))))) (defun compound-p (compound-chunks chunk chunk-start) (or (> chunk-start 0) ; strange condition! (aref compound-chunks (+ chunk-start (length chunk))))) (defun rank-analyses (analyser) (with-slots (ranking) analyser #+debug(print (list :ranking ranking)) (setf ranking (sort ranking (lambda (ri1 ri2) (block rank (cond ((< (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank t)) ((> (ranking-info-effective-length ri1) (ranking-info-effective-length ri2)) (return-from rank nil))) ;; equal effective length ;; check if diff in juncture count is > 1 (let ((jc1 (ranking-info-juncture-count ri1)) (jc2 (ranking-info-juncture-count ri2))) (cond ((< jc1 (1- jc2)) (return-from rank t)) ((> jc1 (1+ jc2)) (return-from rank nil)) ;; diff in juncture count is 1 ((/= jc1 jc2) (let ((flsv1 (ranking-info-final-lexical-s-verb-p ri1)) (flsv2 (ranking-info-final-lexical-s-verb-p ri2))) (when (and (not flsv1) flsv2 ; <1> (ranking-info-final-s-juncture-p ri1)) (return-from rank t)) (when (and flsv1 (not flsv2) (ranking-info-final-s-juncture-p ri2)) (return-from rank nil)) (if (not (ranking-info-unknown-first-chunk-p ri1)) ; <2> (let ((comp (if t ;;(eq *analyser-version* :regexp) (regexp-compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2)) (compare-juncture-s-word analyser (ranking-info-position ri1) (ranking-info-position ri2))))) (if (not (eq comp :incomparable)) (return-from rank comp) (let ((ilv1 (ranking-info-initial-lexical-verb-p ri1)) (ilv2 (ranking-info-initial-lexical-verb-p ri2))) #+debug(print (list ilv1 ri1 ilv2 ri2)) (cond ((and (not ilv1) ilv2 (ranking-info-initial-e-juncture-p ri1)) (return-from rank t)) ((and ilv1 (not ilv2) (ranking-info-initial-e-juncture-p ri1)) (return-from rank nil)))))) (let ((long-chunk1-p (and (ranking-info-long-first-chunk-p ri1) (ranking-info-initial-s-juncture-p ri1))) (long-chunk2-p (and (ranking-info-long-first-chunk-p ri2) (ranking-info-initial-s-juncture-p ri2)))) (cond ((and long-chunk1-p (not long-chunk2-p)) (return-from rank t)) ((and (not long-chunk1-p) long-chunk2-p) (return-from rank nil))))) (return-from rank (< jc1 jc2)))) ;; no difference in number of junctures #+old ((and (ranking-info-possibly-noun-p ri1) (not (ranking-info-possibly-noun-p ri2))) (return-from rank t)) ((> (ranking-info-possibly-noun-p ri1) (ranking-info-possibly-noun-p ri2)) (return-from rank t)) ((and (not (ranking-info-possibly-noun-p ri1)) (ranking-info-possibly-noun-p ri2)) (return-from rank nil)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (ranking-info-first-chunk-compound-p ri1) (not (ranking-info-first-chunk-compound-p ri2))) (return-from rank t)) ((and (not (ranking-info-unknown-first-chunk-p ri1)) (not (ranking-info-first-chunk-compound-p ri1)) (ranking-info-first-chunk-compound-p ri2)) (return-from rank nil)) (t (return-from rank (> (ranking-info-last-chunk-length ri1) (ranking-info-last-chunk-length ri2)))))) nil)))))) ;(print-ranked-analyses "sanddekka") (defun print-ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis (ranking-info-position (aref ranking i)) analysis-tree))))))) (defun most-probable-compound-last-chunk (word &optional nil-if-fullform-p) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) ;; *** make this nicer! (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) ;;(print (list :nth-analysis (nth-analysis (ranking-info-position (aref ranking 0)) analysis-tree))) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking 0)) analysis-tree))))) (defun most-probable-compound (word &optional nil-if-fullform-p (divider "+")) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (unless (or (zerop (length ranking)) (and nil-if-fullform-p (= (ranking-info-effective-length (aref ranking 0)) 1))) (let ((prefix "") (features ())) (loop for (chunk . rest) on (nth-analysis-codes (ranking-info-position (aref ranking 0)) analysis-tree nil) do (cond (rest (setf prefix (concat prefix (car chunk) (if (char= (last-char (car chunk)) #\-) "" (or divider ""))))) (t (dolist (suffix-analysis (cddr chunk)) (push (cons (concat prefix (car suffix-analysis)) (cdr suffix-analysis)) features)) (setf prefix (concat prefix (car chunk)))))) (values prefix features)))))) #+test (print (most-probable-compound "abonnementsbibliotek" t)) (defun ranked-analyses (word) (with-compound-analyser (analyser) (%analyse-compound analyser word) (with-slots (analysis-tree ranking) analyser (setf ranking (rank-analyses analyser)) (dotimes (i (length ranking)) (print (cons (ranking-info-position (aref ranking i)) (nth-analysis-last-chunk-and-codes (ranking-info-position (aref ranking i)) analysis-tree))))))) (defun count-analyses (analysis) (reduce #'+ analysis :key #'caar)) #+test (count-analyses (analyse-compound "sildesalgslag")) ;; buggy?? (defun nth-analysis (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (cons (chunk (caar analysis)) (nth-analysis n (cdar analysis))) (nth-analysis (- n branch-count) (cdr analysis)))))) #+copy (defun nth-analysis-regexp (n analysis) "Uses the perfect hash function in a network where every node is labeled by the number of complete paths starting from it. The n-th path is the following: At each node, look at the adjacent (following) nodes; if the label of the first node is lower or equal n, decrease n by that number and test the next one, and so on; if not, proceed to that node, not altering n." (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (cons chunk (nth-analysis-regexp n (cdar analysis))))) (nth-analysis-regexp (- n branch-count) (cdr analysis)))))) (defun nth-analysis-last-chunk-and-codes (n analysis &optional (valid-only-p t)) "Same algorithm as in NTH-ANALYSIS, but returns the last chunk and all of its codes, or the valid one only if valid-only-p is T" #+debug(print n) (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (cdaar analysis) (multiple-value-bind (f-number n) (floor n (if (cdar analysis) (reduce #'+ (cdar analysis) :key #'caar) 1)) (or (nth-analysis-last-chunk-and-codes n (cdar analysis) valid-only-p) (let ((cac (chunk-and-codes (caar analysis)))) (if valid-only-p (list (car cac) (nth f-number (cdr cac))) cac)) (chunk-and-codes (caar analysis))))) (nth-analysis-last-chunk-and-codes (- n branch-count) (cdr analysis) valid-only-p))))) (defun nth-analysis-codes (n analysis &optional (features-as-list-p t)) "Same algorithm as in NTH-ANALYSIS, but returns all chunks and their codes" (when analysis (let ((branch-count (caaar analysis))) (if (< n branch-count) (destructuring-bind (chunk . f-codes) (chunk-and-codes (caar analysis)) (list* (list* chunk f-codes (remove-duplicates (collecting (dolist (f-code f-codes) (collect-append (if (stringp f-code) (grammar-code-to-features chunk f-code :features-as-list-p features-as-list-p) (list (list chunk f-code)))))) :test (lambda (fl1 fl2) (and (string= (car fl1) (car fl2)) (equal (cdr fl1) (cdr fl2)))))) (nth-analysis-codes n (cdar analysis) features-as-list-p))) (nth-analysis-codes (- n branch-count) (cdr analysis) features-as-list-p))))) ;; test function; not needed (defun print-all-analyses (analysis) (let ((n (count-analyses analysis))) (dotimes (i n) (print (nth-analysis i analysis))))) (defun effective-length (analysis) (let ((i 0)) (loop for chunk across analysis unless (juncture-p (codes chunk)) do (incf i)) i)) (defun previous-chunk (analysis) (let ((length (length analysis))) (if (juncture-p (codes (aref analysis (- length 2)))) (aref analysis (- length 3)) (aref analysis (- length 2))))) ;(analyse-compound "seksogfemti") ;(analyse-compound "tusenniogtretti") ;(analyse-compound "Steiro") ;(analyse-compound "beckettstykke") ;(print-ranked-analyses "tusenniogtretti") ;(analyse-compound "fattig-Norge") ;(%analyse-compound "Norge" 0) ;(analyse-compound "fem-seks") ;(tag-compound "debilografienes") ;(analyse-compound "debilografienes") ;(get-features "Norge") ;(string-values (fullforms *tagger*) "seks") ;; CURRENT-ANALYSIS is a vector containing the current partial compound analysis :eof