;;; -*- Mode: LISP; Package: CGP; BASE: 10; Syntax: ANSI-Common-Lisp; -*- ;; Copyright (C) Paul Meurer 1999, 2000. All rights reserved. ;; paul.meurer@hit.uib.no ;; HIT-centre, University of Bergen ;; ;; Version 0.9 ;;------------------------------------------------------------------------------------- ;; TO DO: ;; ;; +- optimize translation of features into Lisp keywords ;; - make split char a slot in the net ;; - -> etc. ;; ;;------------------------------------------------------------------------------------- ;; QUESTIONS: ;; ;;------------------------------------------------------------------------------------- (in-package "CGP") '("*avtale" "*berg" "*blad" "*bok" "*bolig" "*bre" "*bukt" "*by" "*dal" "*elv" "*film" "*fjell" "*fjord" "*foss" "*fred" "*fylke" "*gate" "*hall" "*hav" "*hjem" "*hotell" "*hus" "*kirke" "*kommune" "*krig" "*kyst" "*land" "*lov" "*løkke" "*minister" "*myr" "*nes" "*pakt" "*park" "*plass" "*president" "*prinsipp" "*pris" "*program" "*protokoll" "*roman" "*sang" "*sen" "*senter" "*serie" "*seter" "*sjø" "*skog" "*skole" "*smug" "*son" "*stad" "*strand" "*sund" "*syndrom" "*teorem" "*torg" "*torv" "*vann" "*veg" "*vei" "*verk" "*vidde" "*vik" "*ø" "*ørken" "*øy" "*ås") ;;; compression #-bit-vector-coding (defparameter *bm-feature-table* (make-hash-table :test #'equal)) #+bit-vector-coding-xx (defparameter *bm-feature-table* (make-hash-table)) ;; features statistics #+no-class (defparameter *statistics-table* (make-hash-table)) #+no-class (defparameter *cg-statistics-table* (make-hash-table)) #+only-once (let* ((count 0) (*tagger* *nny-tagger*) (statistics-table (statistics-table *tagger*))) (clrhash statistics-table) (with-file-lines (line "projects:cgp;nets;nny-lexicon.txt") (setf line (subst-substrings line '("< " "<" " >" ">"))) (destructuring-bind (fullform code inflection-nr lemma+features) (string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (when (zerop (mod (incf count) 1000)) (print (list count fullform))) (destructuring-bind (lemma &rest features) (string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (dolist (feature (abbreviate-features features)) (incf (gethash feature statistics-table 0))))))) ;; collect unabbreviated features #+only-once (let* ((count 0) (*tagger* *nny-tagger*) (features-list ())) (u:with-file-lines (line "projects:cgp;nets;nny-lexicon.txt") (setf line (u:subst-substrings line '("< " "<" " >" ">"))) (destructuring-bind (fullform code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (when (zerop (mod (incf count) 1000)) (print (list count fullform))) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (dolist (feature features) (let ((fixed-features (fix-feature-string feature))) (unless (eq fixed-features :ignore) (dolist (feature fixed-features) (pushnew feature features-list :test #'string=)))))))) features-list) #+testxx (let* ((count 0) (*tagger* *nny-tagger*) (statistics-table (make-hash-table :test #'equal))) (clrhash statistics-table) (with-file-lines (line "projects:cgp;nets;nny-lexicon.txt") (destructuring-bind (fullform code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (when (zerop (mod (incf count) 1000)) (print (list count fullform))) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (dolist (feature features) (incf (gethash feature statistics-table 0)))))) (let ((features-list ())) (maphash (lambda (key value) (push (cons key value) features-list)) statistics-table) (print (sort features-list #'> :key #'cdr)))) #+test (let ((features-list ())) (maphash (lambda (key value) (push (cons key value) features-list)) (statistics-table *nny-tagger*)) ;(print (setf features-list (sort features-list #'> :key #'cdr))) (setf (ordered-simplified-features *nny-tagger*) (mapcar #'car (sort features-list #'> :key #'cdr)))) ;; frequency ordering ;; DON'T CHANGE! (setf (ordered-simplified-features *nbo-tagger*) '(subst appell ent ub be fl mask noeyt adj verb fem tr1 pos m/f pres inf i1 sup i2 tr11 perf-part pret imp pass komp pa1 rl4 d5 rl9 a3 prop pa4 rl5 tr2 pa2 d1 a6 n pa5 tr11/til unorm tr10 a8 tr5 tr12 i4 d5/til rl6 tr9 a7 n1 pa1/til a12 tr3 d4 adv rl9/til a11 fork pa3 tr6 uboey a4 a5 i3 pr8 rl14 rl1 d6/til pa3/til tr15 rl15 rl3/til prep pa2/til tr8 rl3 tr7 tr18 rl12 a9 pa4/til pr9 @adv rl11 rl13 pr7 tr13 n3 d6 pref d8/til pr6 rl2 tr4 rl10 interj det d3 tr19 tr21 d2 pr3 rl8 tr12/til tr23 prep+subst rl7 tr20 a2 d7/til n4/til a15 pr10 pr13 pa5/til tr14 tr16 tr rl14/til pr1 pr2 tr17 a14 d7 rl16 rl10/til kvant tr22 prob d4/til pr12 d8 rl17 pa/til pa pa6 pr4 a13 @tittel pa1refl4 tr13/til pa7 pr4/til n2 d9/til symb rl18 sbu dem pron poss rl17/til rl16/til tr21/til rl12/til @s-pred @interj pers @ pr11 d9 n4 pa11 pa8 adj+subst hum konj+adj \3 forst prep+adj pr5 pr10/til hop> @loes-np @ prep+subst+subst subst+subst det+adj prep+det+subst res clb prep+prep @adv> interj+adv verb+det konj+adv+adj \2 subst+prep+subst pron+verb+verb \1 adj+verb konj+adv+prep @i-obj gen @det> prep+konj+prep + subst+prep adj+det prep+adv nynorsk adv+prep verb+verb sbu+adj adv+adj prep+adj+adj interj+adj subst+konj+subst konj+det+adj adv+subst verb+det+subst prep+perf-part+subst prep+adv+subst v+v @kon @adj> prep+det+subst+kon+det+subst adj+prep+subst verb+subst subst+kvant prep+subst+prep+sbu adv+adv+prep mask/fem/noeyt prep+det+sbu ub/be inf-merke det+adj+det subst+prep+adj+subst pron+prep+adj det+subst+prep+subst mask/fem adj+kon+adj part+prep adv+prep+subst refl adj+adj inf/pres prep+subst+konj+subst adv+adj+prep fl/be subst+adj subst+v+subst subst+perf-part ent/fl ;; refl4 a1 t ;; those are fishy ;; added after calculation below ukjent samset inter bu >>> @sbu <<< @infmerke ;; fra multi-tagger.lisp @ @s-gr @ ;; named entity features foreign &person &sted &org &verk &hend &annet <*avtale> <*berg> <*blad> <*bok> <*bolig> <*bre> <*bukt> <*by> <*dal> <*elv> <*film> <*fjell> <*fjord> <*foss> <*fred> <*fylke> <*gate> <*hall> <*hav> <*hjem> <*hotell> <*hus> <*kirke> <*kommune> <*krig> <*kyst> <*land> <*lov> <*løkke> <*minister> <*myr> <*nes> <*pakt> <*park> <*plass> <*president> <*prinsipp> <*pris> <*program> <*protokoll> <*roman> <*sang> <*sen> <*senter> <*serie> <*seter> <*sjø> <*skog> <*skole> <*smug> <*son> <*stad> <*strand> <*sund> <*syndrom> <*teorem> <*torg> <*vann> <*vei> <*verk> <*vidde> <*vik> <*ørken> <*øy> <*ås> <*aksjon> <*bevegelse> <*departement> <*direktorat> <*forbund> <*forening> <*forum> <*institutt> ;; <*kirke> ;; <*kommune> <*kontor> <*lag> <*monopol> <*møte> <*nemnd> <*organisasjon> <*parti> <*rett> <*revisjon> <*råd> ;; <*senter> <*stand> <*tilsyn> <*utvalg> <*as> <*avis> <*bygning> <*dir.> <*direktør> <*fond> <*formann> <*forsker> <*gjeng> <*gruppe> <*gård> <*hytte> <*ingeniør> <*ist> <*kamp> <*klubb> <*koordinator> <*leder> <*leilighet> <*list> <*log> <*lokale> <*misjon> <*museum> <*område> <*produsent> <*sal> <*selskap> <*sjef> <*spesialist> <*styre> <*bedrift> <*foretak> <**institutt> rx )) #+:nny-parser (setf (ordered-simplified-features *nny-tagger*) '(subst appell eint ub bu fl adj noeyt fem mask pos verb tr1 sup i1 inf m/f pres komp i2 imp tr11 pa1 perf-part pret rl4 st-form d5 a3 rl9 prop pa4 rl5 pa2 pa5 tr2 a6 tr12 tr5 tr11/til d1 n i4 n1 a8 a12 unorm pa3 a7 d5/til tr9 tr10 d4 adv tr3 rl14 pa1/til tr6 fork rl15 rl6 i3 rl9/til a5 a11 pr8 tr18 tr15 d6/til rl13 rl1 prep a4 pa3/til d6 pa4/til pa2/til tr8 rl3 @adv pr9 a9 pr6 rl12 rl3/til pref tr12/til d7 a15 tr7 det tr19 pr7 d3 tr13 d2 tr21 interj tr22 rl10 n2 rl8 tr4 n4 rl2 pa5/til prep+subst a2 tr n3 kvant rl11 pr13 tr16 tr20 rl7 pa6 tr17 rl17 pr2 d8/til pr1 pa8 tr23 tr14 sideform a14 pr11 pa7 pr3 k1 rl14/til k2 a13 dem rl17/til tr13/til rl16 rl10/til symb pr12 poss sbu i pron d7/til rl21 adj+subst tr20/til rl12/til rl16/til n4/til d4/til uttr pers pr4 rl18 d9 @interj prob prep+adj @ i12 d9/til pa11 prep+subst+prep pr10 tr21/til rl19 nom subst+kon+subst @subj tr24 forst pr5 prep+adj+subst prep+subst+subst @fv prep+subst+kon+subst be konj+adv+prep prep+prop \1 @iv sp subst+verb interj+adv suff prep+konj+prep res subst+subst clb hoeflig bokmaal \2 adj+verb prep+adv adv+adj subst+prep+subst det+adj prep+det+subst verb+det subst+prep @adv> konj+adv+adj prep+adv+subst pron+verb+verb @det> verb+verb adj+det adv+adv gen @i-obj adv+verb @adj> prep+perf-part+subst adv+adv+prep adv+subst prep+prep prep+adj+adj konj+det+adj adj+kon+adj @kon verb+det+subst prep+det+sbu subst+perf-part subst+kvant subst+verb+subst ikke-hum prep+kon+subst @sbu interj+adj subst+prep+adj+subst adj+adj sbu+prep inf-merke prep+det+subst+kon+subst part+prep prep+subst+konj+adv verb+subst refl1 det+subst+prep+subst prep+subst+prep+sbu prep+det+subst+kon+det+subst prep+subst+konj+subst ;; added after calculation below ukjent samset @infmerke test uboey >>> <<< ;; fra multi-tagger.lisp ;; fra norsk-map.lisp; evaluate form below @ @app @ foreign rx)) ; @iv-hj @fv-ho @fv-hj @iv-ho @f-subj #+test (maphash (lambda (key val) (declare (ignore val)) (let ((*cg* *nny-cg*) (*tagger* *nny-tagger*)) (print (list key (feature-code key))))) (syntactic-functions *nny-cg*)) #+test (let ((*cg* *nbo-cg*) (syn-features ())) (maphash (lambda (key rule-list) (declare (ignore key)) ;(Print rule-list) (dolist (rule rule-list) (dolist (label (rule-labels (cdr rule))) (pushnew label syn-features)))) (morphosyntactic-mappings *cg*)) (set-difference syn-features (ordered-simplified-features *nbo-tagger*))) #+test (let ((*cg* *nbo-cg*) (syn-features ())) (maphash (lambda (key rule-list) (declare (ignore key)) ;(Print rule-list) (dolist (rule rule-list) (dolist (label (rule-labels (cdr rule))) (pushnew label syn-features)))) (morphosyntactic-mappings *cg*)) (set-difference (collecting (maphash (lambda (f val) (declare (ignore val)) (collect f)) (syntactic-functions *cg*))) syn-features #+ignore (ordered-simplified-features *nbo-tagger*))) ;; find out which features are added later in the CG parser #+test (let ((cg-features ())) (maphash (lambda (set definition) (declare (ignore set)) (dolist (def definition) (when (atom def) (setf def (list def))) (dolist (f def) (unless (or (stringp f) (find f *bm-ordered-simplified-features*)) (pushnew f cg-features))))) (set-declarations *cg*)) cg-features) #+test (let ((cg-features ())) (maphash (lambda (set definition) (declare (ignore set)) (dolist (def definition) (when (atom def) (setf def (list def))) (dolist (f def) (unless (or (stringp f) (find f (ordered-simplified-features *nny-tagger*))) (pushnew f cg-features))))) (set-declarations *nn-cg*)) cg-features) ;; frequency ordering ;; DON'T CHANGE! (setf (ordered-features *nbo-tagger*) '("subst" "appell" "ent" "ub" "be" "fl" "mask" "noeyt" "adj" "verb" "fem" "" "pos" "" "m/f" "pres" "inf" "" "sup" "" "perf-part" "pret" "imp" "" "pass" "komp" "" "" "" "" "" "prop" "" "" "" "" "" "" "" "" "" "" "unorm" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv" "" "" "" "" "fork" "" "" "uboey" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "det" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "kvant" "prob" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "symb" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "sbu" "dem" "pron" "poss" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "pers" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "hum" "" "konj+adj" "3" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "konj" "" "akk" "" "nom" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "res" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "1" "konj+adv+adj" "2" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "mask/fem" "adj+adj" "adv+adj+prep" "ent/fl" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "refl" "inf/pres")) ;; ordered by frequency #-bit-vector-coding (loop for i from 0 for feature in ;; 71 *bm-ordered-features* do (setf (gethash (string-downcase feature) *bm-feature-table*) i)) #+bit-vector-coding (loop for i from 0 for feature in (ordered-simplified-features *nbo-tagger*) do (setf (gethash feature (feature-table *nbo-tagger*)) i)) #+(and :bit-vector-coding :nny-parser) (loop for i from 0 for feature in (ordered-simplified-features *nny-tagger*) do (setf (gethash feature (feature-table *nny-tagger*)) i)) ;(gethash 'pos (feature-table *nny-tagger*)) #-bit-vector-coding (defun feature-code (feature) (let ((feature-name (string-downcase (if (stringp feature) feature (symbol-name feature))))) (gethash feature-name (feature-table *tagger*)))) #+bit-vector-coding (defun feature-code (feature) (gethash feature (feature-table *tagger*))) ;; inverse of previous (setf (feature-vector *nbo-tagger*) (make-array (hash-table-count (feature-table *nbo-tagger*)))) (maphash (lambda (cat code) (setf (aref (feature-vector *nbo-tagger*) code) cat)) (feature-table *nbo-tagger*)) #+:nny-parser (setf (feature-vector *nny-tagger*) (make-array (hash-table-count (feature-table *nny-tagger*)))) #+:nny-parser (maphash (lambda (cat code) (setf (aref (feature-vector *nny-tagger*) code) cat)) (feature-table *nny-tagger*)) #+no-classes (defparameter *code-vector-length* (length *feature-vector*)) (defmacro code-vector-length () `(length (feature-vector *tagger*))) #+no-classes (defparameter *code-vector-sort-array* (make-array (code-vector-length))) (dolist (coding (list *nbo-tagger* #+:nny-parser *nny-tagger*)) (let ((*tagger* coding)) (setf (code-vector-sort-array *tagger*) (make-array (code-vector-length))))) (defun code-feature (code) (svref (feature-vector *tagger*) code)) ;(code-feature (feature-code 'konj)) (defun has-feature-p (feature-vector feature) (= 1 (sbit feature-vector (the fixnum (feature-code feature))))) (defun has-features-p (feature-vector features) (loop for feature in features always (has-feature-p feature-vector feature))) (defun thereis-feature-p (feature-vector features) "Checks if FEATURE-VECTOR has at least one of features" (loop for feature in features thereis (has-feature-p feature-vector feature))) (defun has-feature-code-p (feature-vector code) (declare (fixnum code)) (= 1 (the fixnum (sbit feature-vector code)))) (defun has-feature-codes-p (feature-vector codes) (loop for code fixnum in codes always (has-feature-code-p feature-vector code))) (defun thereis-feature-code-p (feature-vector codes) (loop for code fixnum in codes thereis (has-feature-code-p feature-vector code))) (defparameter *simplify-subcats-p* t) #+test (length (code-features (make-array 679 :element-type 'bit :initial-element 1))) ; -> 309 (defparameter *string-to-symbol-table* (make-hash-table :test #'equal)) (defun %intern (string) (or (gethash string *string-to-symbol-table*) (setf (gethash string *string-to-symbol-table*) (intern (string-upcase string) :cgp)))) #-bit-vector-coding (defun code-features (code-bv) "Returns a list of features, in order determined by *CODE-VECTOR-SORT-ARRAY*. Abbreviates some features." (let ((used-abbrevs ())) (macrolet ((has-feature-p (features) ; true if has one of the features `(find-if (lambda (f) (= 1 (sbit code-bv (feature-code f)))) ,features)) (starts-with-p (pfx string) ;; pfx has to be a constant! `(let ((pfx-length (length ,pfx))) (and (<= pfx-length (length ,string)) (string= ,pfx ,string :end2 pfx-length)))) (abbreviate-feature (pfx abbr fstr) `(when (and (starts-with-p ,pfx ,fstr) (string/= ,fstr "")) (let* ((pfx-length (length ,pfx)) (slash-pos (position #\/ ,fstr)) (angle-pos (position #\> ,fstr)) (feature-sfx (cond ((null slash-pos) (subseq ,fstr pfx-length angle-pos)) ((string= ,fstr "TIL" :start1 (1+ slash-pos) :end1 angle-pos) (subseq ,fstr pfx-length angle-pos)) (t (subseq ,fstr pfx-length slash-pos)))) (abbreviation (%intern (concat ,abbr feature-sfx)))) (unless (find abbreviation used-abbrevs) ; don't collect an abbrev twice (collect abbreviation) (push abbreviation used-abbrevs)) (return-from abbreviate))))) (collecting (loop for pos across *code-vector-sort-array* for bit = (sbit code-bv pos) do (when (= bit 1) (let* ((feature-string (string-upcase (code-feature pos))) (feature (%intern feature-string))) (declare (dynamic-extent feature-string)) (cond ((not *simplify-subcats-p*) (collect feature)) (t (block abbreviate (mapc (lambda (prefix abbr) (abbreviate-feature prefix abbr feature-string)) '("" "" "") ("" "") ("" "") ("" "") ; ?? ("" "" "") ("" "") ("" "" "") ("" "" "") ("" "" "") ("" "" "") ("<.trans11/med>" "") ("" "") ("" "") ("" "" "") ("" "" "") ("" "") ("" "") ("" "" "") ("/opp>" . :ignore) ("" "") ("<>" . :ignore) ("" "" "") ("" "" "") ("" "") ("" "") ("" . :ignore) ("" "") ("" "" "") ("") ("" "") ("" "" "") ("") ("" "") ("" "" "") ("" "" "") ("" "" "") ("veg>" . :ignore) ("" "" "") ("" "" "") ("" "") ("" "") ("" "" "") ("" "" "") ("" "") ("" "") ("" "") ("" "") ("refl4>" "") ("" "") ("" "" "") ("" "") ("" "") ; ?? ("*verb" "verb") ; ?? ("" "") ("" "") ("v+v" "verb+verb") ("perf-part." "perf-part") ("n¿ut" "n¿yt") ("eint/fl" "eint") ;; kommer fra "seg" ("subst+v+subst" "subst+verb+subst") ("susbt" "subst") ("refl" "refl1")) :key #'car :test #'string= ))) (list feature-string))) (defun abbreviate-features (features) (let ((used-abbrevs ())) (macrolet ((starts-with-p (pfx string) ;; pfx has to be a constant! `(let ((pfx-length (length ,pfx))) (and (<= pfx-length (length ,string)) (string= ,pfx ,string :end2 pfx-length)))) (abbreviate-feature (pfx abbr fstr) `(when (and (starts-with-p ,pfx ,fstr) (string/= ,fstr "")) (let* ((pfx-length (length ,pfx)) (slash-pos (position #\/ ,fstr)) (angle-pos (position #\> ,fstr)) (feature-sfx (cond ((null slash-pos) (subseq ,fstr pfx-length angle-pos)) ((string= ,fstr "TIL" :start1 (1+ slash-pos) :end1 angle-pos) (subseq ,fstr pfx-length angle-pos)) (t (subseq ,fstr pfx-length slash-pos)))) (abbreviation (%intern (concat ,abbr feature-sfx)))) (unless (find abbreviation used-abbrevs) ; don't collect an abbrev twice (collect abbreviation) (push abbreviation used-abbrevs)) (return-from abbreviate))))) (collecting (loop for feature-string in features for fixed-features = (fix-feature-string feature-string) unless (eq fixed-features :ignore) do (dolist (feature-string fixed-features) (let* ((feature (%intern (string-trim "." feature-string)))) (cond ((not *simplify-subcats-p*) (collect feature)) (t (block abbreviate (mapc (lambda (prefix abbr) (abbreviate-feature prefix abbr (string-upcase feature-string))) '("" ""))) ;; => (tr11/til tr11) ;(abbreviate-features (list "" "<>" "" "" "pos")) #+old (defun code-features (code-bv) "Returns a list of features, in order determined by *CODE-VECTOR-SORT-ARRAY*" (macrolet ((has-feature-p (features) ; true if has one of the features `(find-if (lambda (f) (= 1 (sbit code-bv (feature-code f)))) ,features))) (collecting (loop for pos across *code-vector-sort-array* for bit = (sbit code-bv pos) do (let ((feature (intern (string-upcase (code-feature pos)) :cgp))) (cond ((= bit 1) (cond ((eq feature 'm/f) nil) ( ) (t (collect feature)))) ((and (find feature '(mask fem)) (has-feature-p '(m/f))) (collect feature)) ((and (has-feature-p '(adj)) ;; if gender, number or definiteness are not determined, ;; add all possibilities in proper order (or (and (find feature '(ent fl)) (not (has-feature-p '(ent fl)))) (and (find feature '(ub be)) (not (has-feature-p '(ub be)))) (and (find feature '(mask fem noeyt)) (not (has-feature-p '(mask fem noeyt m/f fl)))))) (collect feature)) (t nil))))))) #+old (defun code-features (code-bv) "Returns a list of features, in order determined by *CODE-VECTOR-SORT-ARRAY*" (let ((features (collecting (loop for pos across *code-vector-sort-array* for bit = (aref code-bv pos) when (= bit 1) do (let ((feature (intern (string-upcase (code-feature pos)) :cgp))) (case feature ('m/f (collect 'mask) (collect 'fem)) (otherwise (collect feature)))))))) ;; if gender, number or definiteness are not determined, add all possibilities (when (find 'adj features) (unless (find-if (lambda (f) (find f '(ent fl))) features) (setf features (append features (list 'ent 'fl)))) (unless (find-if (lambda (f) (find f '(ub be))) features) (setf features (append features (list 'ub 'be)))) ;; not necessary for plural (unless (find-if (lambda (f) (find f '(mask fem noeyt fl))) features) (setf features (append features (list 'mask 'fem 'noeyt))))) features)) #+old (defun code-features (code-bv) "Returns a list of features" (let ((features (collecting (loop for bit across code-bv and i from 0 when (= bit 1) do (let ((feature (intern (string-upcase (code-feature i))))) (case feature ('m/f (collect 'mask) (collect 'fem)) (otherwise (collect feature)))))))) ;; if gender, number or definiteness are not determined, add all possibilities (when (find 'adj features) (unless (find-if (lambda (f) (find f '(ent fl))) features) (setf features (list* 'ent 'fl features))) (unless (find-if (lambda (f) (find f '(ub be))) features) (setf features (list* 'ub 'be features))) ;; not necessary for plural (unless (find-if (lambda (f) (find f '(mask fem noeyt fl))) features) (setf features (list* 'mask 'fem 'noeyt features)))) features)) ;; features order #+only-once (defparameter *features-order-table* (make-hash-table)) ;; calculate feature order in database #+only-once (let ((count 0)) (u:with-file-lines (line "projects:cgp;multitagger;bm-lexicon.txt") (destructuring-bind (word code inflection-nr lemma+features) (u:string-parse line :whitespace ":") (declare (ignore code inflection-nr)) (destructuring-bind (lemma &rest features) (u:string-parse lemma+features :whitespace " " :delimiter-pairs '((#\" . #\"))) (declare (ignore lemma)) (loop for (f . rest) on (u:collecting (loop for f in features do (cond ((string= f "m/f") (u:collect 'mask) (u:collect 'fem)) (t (u:collect (intern (string-upcase f) :cgp)))))) do (loop for rf in rest do (pushnew rf (gethash f *features-order-table*)))) (when (zerop (mod (incf count) 1000)) (format t "~%~5d ~a" count word)))))) ;; does not work because there is no partial ordering among the features #+ignore (defun sort-features (features) (stable-sort features (lambda (f1 f2) (let ((to-the-right (gethash (intern (string-upcase f1)) *features-order-table*)) (to-the-left (gethash (intern (string-upcase f2)) *features-order-table*))) (and (or (find (intern (string-upcase f2)) to-the-right) (not (find (intern (string-upcase f1)) to-the-left)))))))) ;; ordered by hand #-bit-vector-coding (loop for i from 0 and f across #("fork" "symb" "subst" "adj" "verb" "prep" "adv" "sbu" "konj" "det" "dem" "kvant" "prob" "pron" "pers" "poss" "refl" "pos" "komp" "sup" "res" "" "" "" "1" "2" "3" "mask" "fem" "noeyt" "m/f" "mask/fem" "appell" "prop" "ub" "be" "ent" "fl" "ent/fl" "uboey" "inf" "pres" "inf/pres" "perf-part" "pret" "imp" "pass" "hum" "nom" "akk" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "" "konj+adj" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "" "" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "konj+adv+adj" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "adj+adj" "adv+adj+prep" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "unorm") do (setf (aref *code-vector-sort-array* i) (position f *feature-vector* :test #'string-equal))) #+test (print (length (remove-duplicates (ordered-simplified-features *nbo-tagger*)))) #+bit-vector-coding (let ((*tagger* *nbo-tagger*) (ordered-simplified-features ())) (loop for i from 0 to (1- (length (code-vector-sort-array *tagger*))) do (setf (aref (code-vector-sort-array *tagger*) i) i)) (loop for f in (ordered-simplified-features *nbo-tagger*) do (pushnew (car (abbreviate-features (list (string-downcase (symbol-name f))))) ordered-simplified-features)) (loop for i from 0 and f in (nreverse ordered-simplified-features) do (setf (aref (code-vector-sort-array *tagger*) i) (or (position f (feature-vector *tagger*)) (progn (print (list i f)) nil))))) #+ignore (let ((*tagger* *nbo-tagger*)) (loop for code across (code-vector-sort-array *nbo-tagger*) do (print (code-feature code)))) ;; ** "eint" and "ent" are features! ;; ordered by hand #+(and :bit-vector-coding :nny-parser) (let ((*tagger* *nny-tagger*) (ordered-simplified-features ())) (loop for i from 0 to (1- (length (code-vector-sort-array *tagger*))) do (setf (aref (code-vector-sort-array *tagger*) i) i)) (loop for f across #("fork" "symb" "subst" "adj" "verb" "prep" "adv" "sbu" "konj" "det" "dem" "kvant" "prob" "pron" "pers" "poss" "pos" "komp" "sup" "res" "" "" "" "1" "2" "3" "mask" "fem" "noeyt" "m/f" "appell" "prop" "ub" "be" "bokmaal" "eint" "fl" "inf" "pres" "perf-part" "pret" "imp" "hum" "ikke-hum" "nom" "akk" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "" "konj+adj" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "det+subst" "prep+subst+prep" "" "" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "subst+verb" "prep+prop" "prep+subst+subst" "prep+det+subst" "det+adj" "subst+subst" "CLB" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "konj+adv+adj" "pron+verb+verb" "konj+adv+prep" "adj+verb" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "interj+adj" "prep+adv+subst" "prep+adj+adj" "prep+perf-part+subst" "adv+subst" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "prep+det+sbu" "det+subst+prep+subst" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "adj+kon+adj" "prep+det+subst+kon+det+subst" "adj+adj" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "unorm" "bu" "st-form" "" "" "" "" "" "" "" "" "" "sideform" "" "" "" "" "" "" "" "refl1" "adv+verb" "uttr" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv+adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "subst+verb+subst" "" "" "" "" "" "" "" "" "sbu+prep" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+kon+subst" "prep+subst+konj+adv" "" "" "" "" "" "" "" "prep+det+subst+kon+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "samset" "bu" "ukjent" "" "" "" "" "" "" "" "" "" "" "" "" "" ">>>" "<<<" "" "" @ @s-gr @" "@I-OBJ" "@DET>" "@KON" "@ADJ>" "@SBU" "@OBJ" "@SUBJ" "@" "foreign") do (pushnew (car (abbreviate-features (list f))) ordered-simplified-features)) (loop for i from 0 and f in (nreverse ordered-simplified-features) do (setf (aref (code-vector-sort-array *tagger*) i) (or (position f (feature-vector *tagger*)) (progn (print (list i f)) nil))))) #+test (defparameter *nbo-vector* #("fork" "symb" "subst" "adj" "verb" "prep" "adv" "sbu" "konj" "det" "dem" "kvant" "prob" "pron" "pers" "poss" "refl" "pos" "komp" "sup" "res" "" "" "" "1" "2" "3" "mask" "fem" "noeyt" "m/f" "mask/fem" "appell" "prop" "ub" "be" "ent" "fl" "ent/fl" "uboey" "inf" "pres" "inf/pres" "perf-part" "pret" "imp" "pass" "hum" "nom" "akk" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@adv" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@tittel" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@s-pred" "" "@interj" "@" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "vei>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+subst" "" "konj+adj" "forst" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "hop>" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "@loes-np" "" "@obj" "" "@subj" "det+subst" "prep+subst+prep" "" "" "" "prep+adj+subst" "subst+kon+subst" "hoeflig" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "suff" "" "" "prep+subst+kon+subst" "sp" "@fv" "@iv" "subst+verb" "prep+prop" "prep+subst+subst" "" "prep+det+subst" "det+adj" "subst+subst" "clb" "" "prep+prep" "verb+det" "interj+adv" "subst+prep+subst" "konj+adv+adj" "pron+verb+verb" "konj+adv+prep" "@adv>" "adj+verb" "@i-obj" "+" "@det>" "prep+konj+prep" "adj+det" "prep+adv" "subst+prep" "gen" "v+v" "interj+adj" "sbu+adj" "prep+adv+subst" "@kon" "eint" "subst+konj+subst" "prep+adj+adj" "@adj>" "prep+perf-part+subst" "adv+subst" "perf-part." "adv+prep" "verb+verb" "verb+det+subst" "konj+det+adj" "adv+adj" "pron+prep+adj" "prep+det+sbu" "ub/be" "det+subst+prep+subst" "det+adj+det" "adv+prep+subst" "adj+prep+subst" "mask/fem/noeyt" "adv+adv+prep" "prep+subst+konj+subst" "part+prep" "fl/be" "subst+v+subst" "prep+subst+prep+sbu" "inf-merke" "verb+subst" "" "adj+kon+adj" "prep+det+subst+kon+det+subst" "subst+adj" "adj+adj" "adv+adj+prep" "subst+perf-part" "subst+prep+adj+subst" "subst+kvant" "unorm")) #+test (stable-sort (nreverse '("" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "subst+kvant" "verb+det" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adj+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+det+subst+kon+subst" "" "" "" "adv+adv+prep" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+subst+konj+adv" "prep+subst+prep+sbu" "adj+kon+adj" "verb+det+subst" "prep+kon+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@ADJ>" "" "" "" "" "det+adj" "pron+verb+verb" "" "" "" "subst+perf-part" "" "" "" "@SBU" "sbu+prep" "subst+verb" "res" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@" "" "" "" "" "" "" "" "" "subst+verb+subst" "" "" "" "" "" "" "" "" "" "" "prep+prop" "" "" "forst" "@KON" "" "" "" "" "" "" "" "" "" "" "" "prep+konj+prep" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv+subst" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv+adv" "" "" "" "" "" "" "" "" "konj+adv+adj" "konj+adv+prep" "konj+det+adj" "" "" "" "konj+adj" "" "" "" "" "" "" "" "" "" "" "prep+perf-part+subst" "" "" "" "" "@DET>" "adj+det" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep+adv+subst" "" "" "prep+adj" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "@ADV>" "" "" "" "adj+subst" "" "" "" "symb" "" "uttr" "verb+verb" "adv+adj" "@I-OBJ" "det+subst+prep+subst" "part+prep" "adv+verb" "prep+det+sbu" "prep+subst+konj+subst" "be" "ent" "subst+prep+adj+subst" "subst+subst" "verb+subst" "interj+adj" "prep+prep" "adj+verb" "@L¯S-NP" "@" "@IV" "inf-merke" "1" "@O-PRED" "prep+adv" "refl1" "" "" "" "" "ikke-hum" "sp" "prep+subst+prep" "prep+subst" "CLB" "2" "prob" "@ADV" "det+subst" "poss" "akk" "nom" "hum" "3" "pers" "" "konj" "" "sbu" "pron" "" "" "" "" "" "" "" "" "" "" "suff" "dem" "kvant" "det" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "prep" "" "" "" "" "" "" "sideform" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "adv" "" "" "" "" "" "" "unorm" "" "" "" "" "" "" "pref" "" "" "" "" "" "" "" "" "" "" "" "interj" "" "" "" "" "" "" "fork" "" "" "" "" "" "" "sup" "komp" "pos" "" "" "" "" "m/f" "perf-part" "pret" "st-form" "pres" "imp" "" "" "adj" "" "inf" "verb" "n¿yt" "fem" "fl" "bu" "ub" "eint" "appell" "mask" "subst")) (lambda (x y) (let ((x-pos (position (%convert-string x) *nbo-vector* :test #'string-equal)) (y-pos (position (%convert-string y) *nbo-vector* :test #'string-equal))) (or (and x-pos (not y-pos)) (and x-pos y-pos (<= x-pos y-pos)))))) ;; returns feature-code and list of unencoded features (defun encode-features (&rest features) (let ((feature-code (make-array (code-vector-length) :element-type 'bit :initial-element 0)) (unknown-features ())) (dolist (feature features) (let ((code (feature-code feature))) (if code (setf (sbit feature-code code) 1) (push feature unknown-features)))) (values feature-code unknown-features))) (defun add-features (feature-code &rest features) (dolist (feature features) (let ((code (feature-code feature))) (when code (setf (sbit feature-code code) 1)))) feature-code) (defun translate-feature (feature) (declare (ignore feature))) #|| (time (dotimes (i 10000) (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (time (let* ((bit-vector (make-array *code-vector-length* :element-type 'bit :initial-element 0)) (codes '(1 6 9 30))) (dolist (i '(6 9 30 50 40)) (setf (sbit bit-vector i) 1)) (dotimes (i 10000) (loop for i fixnum in codes always (= (sbit bit-vector i) 1))))) (time (let ((features '(zuzu gaga fifi roro loeloe))) (dotimes (i 10000) (loop for c in '(gaga fifi roro loeloe) always (find c features))))) ||# (defun %convert-string (str) (let ((length (length str))) (labels ((convert (pos result) (if (= pos length) result (let ((start pos) (entity nil)) (loop for char = (char str pos) do (setf entity (case char (#\¿ "oe") (#\¾ "ae") (#\Œ "aa") (#\¯ "OE") (#\® "AE") (#\ "AA") (otherwise nil)) #+ignore (and (not (<= (the fixnum #.(char-code #\A)) (the fixnum (char-code char)) (the fixnum #.(char-code #\z)))) (char/= char #\<) (char/= char #\>) (gethash char *sgml-char-entity-table*))) (incf pos) until (or (= pos length) entity)) (cond ((not entity) (concat result (subseq str start))) ((> pos start) (convert pos (concat result (subseq str start (1- pos)) entity))) (t (convert pos (concat result entity)))))))) (convert 0 "")))) #-bit-vector-coding (defun code-from-features (features) "Translates a list of features into a bit vector." (let ((feature-code (make-array *code-vector-length* :element-type 'bit :initial-element 0))) (dolist (feature features) (let* ((f (string-downcase feature)) (code (gethash f *bm-feature-table*))) (if code (setf (sbit feature-code code) 1) (warn "Could not find code for ~s" f)))) feature-code)) #+bit-vector-coding (defun code-from-features (features) "Translates a list of features into a bit vector." (unless (eq *tagger* *nny-tagger*) (setf features (mapcar #'%convert-string features))) (let ((feature-code (make-array (code-vector-length) :element-type 'bit :initial-element 0))) (dolist (feature (abbreviate-features features)) (let ((code (gethash feature (feature-table *tagger*)))) (if code (setf (sbit feature-code code) 1) (warn "Could not find code for ~s in ~s" feature features)))) feature-code)) (defun set-feature (bit-vector feature) (setf (sbit bit-vector (feature-code feature)) 1) bit-vector) (defun reset-feature (bit-vector feature) (setf (sbit bit-vector (feature-code feature)) 0) bit-vector) ;(abbreviate-features '("mask" "fem" "noeyt")) ;(code-from-features '("mask" "fem" "noeyt")) ;(code-from-features '("mask" "fem")) (defun features-statistics (features code-vector) (dolist (feature features) (let* ((f (string-downcase (symbol-name feature))) (f-list (translate-feature f))) (cond (f-list (dolist (f f-list) (let ((code (gethash f (feature-table *tagger*)))) (when code (incf (aref code-vector code)))))) (t (let ((code (gethash f (feature-table *tagger*)))) (when code (incf (aref code-vector code))))))))) #+ignore (defparameter *code-vector* (make-array (code-vector-length) :element-type 'fixnum :initial-element 0)) #+old (defun bit-vector-to-string (cv) (let ((string (make-string (ceiling (/ (length cv) 8)))) (byte 0)) (loop for bit across cv with i = 0 and pos = 0 do (when (= bit 1) (setf byte (logxor byte (ash 1 i)))) (when (= (incf i) 8) (setf i 0 (char string pos) (code-char byte) byte 0) (incf pos)) ;finally (setf (char string pos) (code-char byte)) ) ;; *** SOMEWHAT BUGGY (let ((last-non-null (position-if-not (lambda (c) (char= c #\Null)) string :from-end t))) (if last-non-null (subseq string 0 (1+ last-non-null)) "")))) (defun bit-vector-to-string (cv) (let ((string (make-string (ceiling (/ (length cv) 8)))) (byte 0)) (loop for bit across cv with i = 0 and pos = 0 do (when (= bit 1) (setf byte (logxor byte (ash 1 i)))) (when (= (incf i) 8) (setf i 0 (char string pos) (code-char byte) byte 0) (incf pos)) finally (unless (zerop i) (setf (char string pos) (code-char byte)))) ;; *** SOMEWHAT BUGGY? (let ((last-non-null (position-if-not (lambda (c) (char= c #\Null)) string :from-end t))) (if last-non-null (subseq string 0 (1+ last-non-null)) "")))) (defun string-to-bit-vector (string) (declare (optimize (speed 3) (safety 0))) (let ((bv (make-array (code-vector-length) :element-type 'bit :initial-element 0))) (loop for c across string for i fixnum from 0 do (let ((byte (the fixnum (char-code c)))) (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit bv (the fixnum (+ (the fixnum (* i 8)) j))) 1))))) bv)) (defun array-to-bit-vector (array) (declare (optimize (speed 3) (safety 0))) (let ((bv (make-array (code-vector-length) :element-type 'bit :initial-element 0))) (loop for byte fixnum across array for i fixnum from 0 do (dotimes (j 8) (declare (fixnum j)) (unless (zerop (the fixnum (logand byte (the fixnum (ash 1 j))))) (setf (sbit bv (the fixnum (+ (the fixnum (* i 8)) j))) 1)))) bv)) #+unoptimized (defun string-to-bit-vector (string) (let ((bv (make-array (code-vector-length) :element-type 'bit :initial-element 0))) (loop for c across string for i from 0 do (let ((byte (char-code c))) (dotimes (j 8) (unless (zerop (logand byte (ash 1 j))) (setf (sbit bv (+ (* i 8) j)) 1))))) bv)) ;(split "asdf:qwer:tyui:xcvb" #\: 3) ;(pushnew :bit-vectors *features*) (defun fullform-values (word) (let ((values (string-values (code-lexicon *tagger*) word))) (mapcar (lambda (value) (destructuring-bind (code+inflection-nr compressed-lemma feature-vector) (split value #\: 3) ;; OBS: feature vector might contain split char! (list* (multiple-value-call #'decimal-to-alpha (%decode-str code+inflection-nr)) (decompress-string compressed-lemma word) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))) values))) #+test (time (dotimes (i 1000) (fullform-values "kommet"))) ; #-bit-vectors (2.590 seconds, 6,424,056 bytes) ; #+bit-vectors (2.352 seconds, 5,712,056 bytes) #+test (let ((*tagger* *nbo-tagger*)) (print (fullform-features "mye"))) ;; returns a list of feature lists (defun fullform-features (word &optional net) (let ((values (string-values (or net (lexicon *tagger*)) word))) (mapcar (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (declare (ignore compressed-lemma)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector))) values))) #+test (time (dotimes (i 1000) (fullform-features "kommet"))) ;#-bit-vectors (0.939 seconds, 2,456,016 bytes) ;#+bit-vectors (0.796 seconds, 2,032,016 bytes) (0.948 seconds without optimizing of string-to-bit-vector) ;(lemma-and-features "se") #+test (time (dotimes (i 1000) (lemma-and-features "kommet"))) ; 0.921 seconds, 2,256,016 bytes #+old+old (defun lemma-and-features (word &key decompress-base (net *bm-lexicon*)) (declare (optimize (speed 3) (safety 0)) (string word)) (let ((values (string-values net word))) (mapcar (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))) values))) #+old (defun lemma-and-features (word &key decompress-base (net *bm-lexicon*)) (declare (optimize (speed 3) (safety 0)) (string word)) (u:collecting (nmap-string-values net word (lambda (value) (destructuring-bind (compressed-lemma feature-vector) (split value #\: 2) (u:collect (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (string-to-bit-vector feature-vector)) #+bit-vectors(string-to-bit-vector feature-vector)))))))) (defvar *%strings* (make-array 0 :adjustable t :fill-pointer t)) (defvar *%byte-arrays* (make-array 0 :adjustable t :fill-pointer t)) (defun %vector-pop (vector) (let* ((fill (fill-pointer vector))) (declare (fixnum fill)) (unless (zerop fill) (decf fill) (setf (fill-pointer vector) fill) (aref vector fill)))) (defmacro %with-string ((string) &body body) `(let ((,string (or (%vector-pop *%strings*) (make-array 0 :element-type #+mcl 'base-character #-mcl 'character :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,string) 0) (vector-push-extend ,string *%strings*)))) (defmacro %with-byte-array ((array) &body body) `(let ((,array (or (%vector-pop *%byte-arrays*) (make-array 0 :element-type '(unsigned-byte 8) :adjustable t :fill-pointer t)))) (unwind-protect (progn ,@body) (setf (fill-pointer ,array) 0) (vector-push-extend ,array *%byte-arrays*)))) #+test (let ((*tagger* (multi-tagger *nbo-cg*))) (lemma-and-features "" :net (expressions *tagger*))) (defun lemma-and-features (word &key decompress-base (net (lexicon *tagger*))) (declare (optimize (speed 3) (safety 0)) (string word)) ;;(print (list word decompress-base net)) (%with-string (%string) (%with-byte-array (%array) (collecting (nmap-string+array-values net word (lambda (compressed-lemma feature-vector) (collect (cons (decompress-string compressed-lemma (or decompress-base word)) #-bit-vectors(code-features (array-to-bit-vector feature-vector)) #+bit-vectors(array-to-bit-vector feature-vector)))) %string %array))))) (defun word-lemma (word &key decompress-base (net (lexicon *tagger*))) (declare (optimize (speed 3) (safety 0)) (string word)) (let ((lemma-list ())) (%with-string (%string) (%with-byte-array (%array) (nmap-string+array-values net word (lambda (compressed-lemma feature-vector) (declare (ignore feature-vector)) (pushnew (decompress-string compressed-lemma (or decompress-base word)) lemma-list :test #'string=)) %string %array))) lemma-list)) #+test (let ((*tagger* *nbo-tagger*) (count 0)) (block map (nmap-strings (lexicon *tagger*) (lambda (string) (destructuring-bind (word compressed-lemma &optional fvector) (string-parse string :separating-chars ":") (let ((lemma (decompress-string compressed-lemma word))) (print (list word lemma fvector)) (print (concat lemma ":" (string-net::compress-string word lemma) ":" fvector)))) (when (= (incf count) 20) (return-from map)))))) #+test (write-lemma-wordforms-net *nbo-tagger* "projects:cgp;nets;nbo-lemmata-forms.net") #+test (write-lemma-wordforms-net *nny-tagger* "projects:cgp;nets;nny-lemmata-forms.net") (defun write-lemma-wordforms-net (tagger net-file) (let ((count 0) (net (make-instance 'string-net::list-string-net))) (block map (nmap-strings (lexicon tagger) (lambda (string) (destructuring-bind (word compressed-lemma &optional fvector) (split string #\: 3) (let* ((lemma (decompress-string compressed-lemma word)) (l-w-f-string (concat lemma ":" (string-net::compress-string word lemma) ":" fvector))) (string-net::add-string net l-w-f-string) (when (zerop (mod (incf count) 10000)) (print l-w-f-string)) #+ignore(when (= count 20) (return-from map))))))) (minimize-net net) (string-net::write-string-net net net-file))) #+old (defun %alpha-to-decimal (code) (multiple-value-bind (code form) (alpha-to-decimal code) (let ((int (+ (* code 16) form))) (with-output-to-string (stream) (loop with i = int until (zerop i) do (write-char (code-char (logand i 255)) stream) (setf i (ash i -8))))))) (defun %alpha-to-decimal (code) (multiple-value-bind (code form) (alpha-to-decimal code) (let ((int (+ (* code 16) form))) (with-output-to-string (stream) (loop with i = int until (zerop i) do (let ((c (logand i 255))) (write-char (code-char (string-net::translate-char-code c)) stream)) (setf i (ash i -8))))))) #+old (defun %decode-str (str) (let ((code 0)) (loop for i from 0 and c across str do (setf code (+ code (ash (char-code c) (* 8 i))))) (floor code 16))) ;(time (dotimes (i 10000) (%decode-str (%alpha-to-decimal "200:3")))) ;(alpha-to-decimal "200:3") ;(multiple-value-call #'decimal-to-alpha (alpha-to-decimal "200:3")) #+test (let ((*cg* (gethash "nny-juni02" *cg-table*)) (*tagger* *nny-tagger*)) (print *cg*) (disambiguate-from-string "Vi har det bra." :tagging-niveau :syntactic-disambiguation :cg *cg*)) #| ; features frequency 826310 subst 820674 appell 540346 ent 515119 ub 471557 be 453587 fl 379957 mask 363895 noeyt 250145 adj 125571 verb 120919 fem 112882 91021 pos 83339 43586 m/f 41796 pres 41784 inf 41037 36545 sup 27363 20949 perf-part 20945 pret 20928 imp 20843 20841 pass 18279 komp 12155 8767 7439 7252 5863 5103 prop 4950 4764 4678 3890 3622 3158 3155 2982 2948 2926 2734 unorm 2662 2662 2518 2377 2321 2271 2250 2222 2181 2069 1958 1848 1826 1793 1771 1710 1639 1595 1584 1553 1529 1496 1485 1485 1320 adv 1309 1298 1265 1233 1200 fork 1188 1166 1154 uboey 1134 1133 1099 1085 1078 1034 1023 1012 1001 979 968 963 891 869 858 847 759 737 726 715 706 prep 704 704 704 703 693 685 682 682 674 649 638 595 594 572 550 550 550 539 531 517 509 @ADV 506 495 463 456 440 429 418 418 397 396 390 385 385 385 385 369 pref 333 330 330 330 330 330 319 319 308 308 308 308 308 308 302 297 297 286 286 286 286 283 interj 279 275 275 275 264 264 264 253 253 243 243 242 242 det 242 231 231 225 220 220 220 210 209 209 209 198 198 198 198 190 187 187 187 184 prep+subst 177 176 176 176 176 176 176 165 165 165 165 165 165 165 165 154 154 154 154 143 143 143 143 143 143 143 143 132 132 132 132 132 132 132 132 132 132 132 131 127 125 121 121 121 121 121 121 121 121 114 111 110 110 110 110 110 110 104 kvant 104 prob 99 99 99 99 99 99 99 99 91 88 88 88 88 88 88 88 88 88 88 88 88 81 77 77 77 77 77 77 77 77 77 77 77 77 77 77 74 @TITTEL 67 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 66 58 57 55 55 55 55 55 55 symb 55 55 55 55 55 55 55 55 55 55 55 55 55 55 52 sbu 50 dem 47 pron 46 poss 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 41 @S-PRED 38 37 @INTERJ 37 pers 34 @ 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 vei> 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 30 adj+subst 29 hum 28 26 konj+adj 26 3 25 forst 25 25 prep+adj 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 hop> 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 21 @ 21 @LoeS-NP 20 20 @OBJ 20 17 @SUBJ 17 det+subst 16 prep+subst+prep 15 konj 14 14 akk 14 14 nom 14 14 prep+adj+subst 13 subst+kon+subst 12 hoeflig 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 10 suff 10 10 9 prep+subst+kon+subst 9 sp 9 @FV 8 @IV 8 subst+verb 8 prep+prop 7 prep+subst+subst 7 6 res 6 prep+det+subst 6 det+adj 6 subst+subst 5 CLB 5 5 prep+prep 4 verb+det 4 interj+adv 4 subst+prep+subst 4 1 4 konj+adv+adj 4 2 4 pron+verb+verb 4 konj+adv+prep 4 @ADV> 4 adj+verb 3 @I-OBJ 3 + 3 @DET> 3 prep+konj+prep 3 adj+det 3 prep+adv 3 subst+prep 3 gen 2 v+v 2 interj+adj 2 sbu+adj 2 prep+adv+subst 2 @KON 2 eint 2 subst+konj+subst 2 prep+adj+adj 2 @ADJ> 2 prep+perf-part+subst 2 adv+subst 2 perf-part. 2 adv+prep 2 verb+verb 2 verb+det+subst 2 konj+det+adj 2 adv+adj 1 pron+prep+adj 1 prep+det+sbu 1 ub/be 1 det+subst+prep+subst 1 det+adj+det 1 adv+prep+subst 1 adj+prep+subst 1 mask/fem/noeyt 1 adv+adv+prep 1 prep+subst+konj+subst 1 part+prep 1 fl/be 1 subst+v+subst 1 prep+subst+prep+sbu 1 inf-merke 1 verb+subst 1 1 adj+kon+adj 1 prep+det+subst+kon+det+subst 1 subst+adj 1 mask/fem 1 adj+adj 1 adv+adj+prep 1 ent/fl 1 subst+perf-part 1 subst+prep+adj+subst 1 subst+kvant 1 refl 1 inf/pres |# ;;; EOF