(in-package :tsdb)

;;;
;;; [incr tsdb()] --- Competence and Performance Profiling Environment
;;; Copyright (c) 2014 -- 2014 Stephan Oepen (oe@ifi.uio.no)
;;;
;;; This program is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU Lesser General Public License as published by
;;; the Free Software Foundation; either version 2.1 of the License, or (at
;;; your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful, but WITHOUT
;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;;; License for more details.
;;;

#+:null
(setf angelina
  (analyze "erg/trunk/angelina/14-10-02/pet.1"
           :condition "t-active == 1" :thorough '(:derivation) :inputp t)
  derivation
  (get-field :derivation (first (get-field :results (first angelina)))))

(defun micro-tokenize (string)
  ;;
  ;; run .string. through the REPP :micro rule set and return the surface
  ;; forms of the resulting tokens, as a list of strings.
  ;;
  (loop
      for token in (lkb::repp string :repp :micro :format :raw)
      collect (lkb::token-form token)))

(defun derivation-lexical-items (derivation)
  ;;
  ;; return the list of lexical sub-derivations of .derivation., walking the
  ;; tree top-down and stopping at each lexical item.
  ;;
  (let* ((top (derivation-root derivation))
         (rule (ignore-errors (lkb::get-lex-rule-entry (intern top :lkb))))
         (daughters (derivation-daughters derivation)))
    ;;
    ;; we detect lexical items by one of two conditions: either the top node is
    ;; an instance of a lexical rule, or it is a preterminal, i.e. its one and
    ;; only daughter is a tokens list.
    ;;
    (if (or rule (null (derivation-daughters (first daughters))))
      (list derivation)
      (loop
          for daughter in daughters
          append (derivation-lexical-items daughter)))))

(defun dt-annotate (derivation &optional (table (make-hash-table)) head)
  ;;
  ;; walk .derivation. and record, for each preterminal, the sub-derivation
  ;; that it syntactically depends on (as determined by the head information
  ;; in *derivation-heads*): table entries map preterminals to pairs of the
  ;; governing sub-derivation and (in a second pass, below) its head leaf.
  ;;
  (let ((daughters (derivation-daughters derivation)))
    (if (null (derivation-daughters (first daughters)))
      ;;
      ;; a preterminal: record the governing derivation handed down to us,
      ;; leaving the second slot of the pair to be filled in later.
      ;;
      (when head (setf (gethash derivation table) (cons head nil)))
      (let* ((top (derivation-root derivation))
             (match (rest (assoc top *derivation-heads*)))
             (arity (get-field :arity match))
             ;;
             ;; .n. is the index of the head daughter, but only for rules of
             ;; arity greater than one (unary rules pass the head through).
             ;;
             (n (and arity (> arity 1) (get-field :head match))))
        (loop
            for daughter in daughters
            for i from 0
            do
              ;;
              ;; non-head daughters become dependents of this derivation;
              ;; the head daughter inherits whatever .head. we were given.
              ;;
              (dt-annotate
               daughter table
               (if (and n (not (= i n))) derivation head))))))
  ;;
  ;; second pass: resolve each governing derivation to its lexical head leaf.
  ;;
  (loop
      for head being each hash-value in table
      for leaf = (derivation-head (first head))
      do (setf (rest head) leaf))
  table)

(defun dt-generalize (label)
  ;;
  ;; map a derivation node .label. to a generalized, lower-case category: the
  ;; prefix up to (but excluding) the first underscore; nil labels (e.g. the
  ;; virtual top of the tree) map to "root".
  ;;
  (if label
    (let* ((label (string-downcase label))
           (break (position #\_ label)))
      (subseq label 0 break))
    "root"))

(defun dt-convert (item &key (pos t) (percent 100) stream tokens)
  ;;
  ;; convert the top-ranked derivation of .item. into YY token lattice format,
  ;; in micro-tokenization; with probability .percent. (per lexical token),
  ;; inject an additional pseudo-token recording the generalized category and
  ;; start position of the governing head (from dt-annotate).  when .stream.
  ;; is non-nil, print the lattice there; when .tokens. is non-nil, print one
  ;; micro-token (with its PoS tags, if .pos.) per line to that stream.
  ;;
  (loop
      with result = nil
      with p-input = (get-field :p-input item)
      with results = (or (get-field :ranks item) (get-field :results item))
      with derivation = (get-field :derivation (first results))
      with table = (dt-annotate derivation)
      for preterminal in (derivation-preterminals derivation)
      for ids = (remove-duplicates (derivation-token-ids preterminal) :test #'=)
      for initials
      = (loop
            for id in ids
            for initial
            = (find id p-input :key #'(lambda (token) (get-field :id token)))
            for micro = (micro-tokenize (get-field :form initial))
            collect (cons initial micro))
      collect (cons preterminal initials) into lexicals
      finally
        ;;
        ;; at this point, .lexicals. is a list whose top-level entries describe
        ;; lexical tokens, each of the form (<preterminal> (<initial> <micro>+)+),
        ;; where the <initial> elements are the underlying initial token(s), and
        ;; the <micro> elements are the micro-tokenization of its surface form.
        ;;
        ;;
        ;; one initial token could be part of multiple lexical tokens, because
        ;; additional splits can be introduced in micro-tokenization.
        ;; ‘constraint-based’, for example, would be split after the hyphen,
        ;; and (barring the existence of a multi-word lexical entry), there
        ;; will be lexical tokens |constraint-| and |based|.  in our initial
        ;; .lexicals. list, however, both micro-tokens will be on both lexical
        ;; tokens, and hence we need to distribute micro-tokens accordingly.
        ;;
        (loop
            with i = 0
            with initial = nil
            for lexicals on lexicals
            for current = (rest (first lexicals))
            for next = (rest (second lexicals))
            do
              (cond
               ;;
               ;; the last initial token of .current. recurs as the first one
               ;; of .next.: keep only the .i.-th micro-token here and remember
               ;; the shared initial token for the following iteration(s).
               ;;
               ((eq (first (first (last current))) (first (first next)))
                (setf initial (first (first next)))
                (setf (first (last current))
                  (list initial (nth i (rest (first (last current))))))
                (incf i))
               ;;
               ;; .current. starts with a shared initial token carried over
               ;; from the preceding lexical token: it receives the next
               ;; micro-token in line, and the carry-over state is reset.
               ;;
               (initial
                (setf (first current)
                  (list initial (nth i (rest (first current)))))
                (setf initial nil)
                (setf i 0))))
        ;;
        ;; now we have computed everything to output in micro tokenization
        ;;
        (loop
            with n = 0
            for lexical in lexicals
            for preterminal = (first lexical)
            for head = (gethash preterminal table)
            for pid = (derivation-id preterminal)
            do
              (loop
                  with finitial = (first (rest lexical))
                  for initial in (rest lexical)
                  for token = (first initial)
                  do
                    (loop
                        with fmicro = (first (rest initial))
                        for micro in (rest initial)
                        do
                          ;;
                          ;; once per lexical token (i.e. on its first initial
                          ;; token and first micro-token), randomly sample
                          ;; whether to emit the head dependency pseudo-token.
                          ;;
                          (when (and (< (random 100) percent)
                                     (eq initial finitial) (eq micro fmicro))
                            (push
                             (format
                              nil "(~a0000, ~a, ~a, 1, ~
                                   \"⌊→¦~(~a~)~@[¦~a~]⌋\", 0, \"null\")"
                              n n (incf n)
                              (dt-generalize (derivation-root (first head)))
                              (and (rest head) (derivation-from (rest head))))
                             result))
                          (push
                           (format
                            nil "(~a~4,'0d, ~a, ~a, <~a:~a>, 1, ~s, ~
                                 0, \"null\"~@[,~{ ~s~}~])"
                            n pid n (incf n)
                            (derivation-from preterminal)
                            (derivation-to preterminal)
                            micro (and pos (get-field :tags token)))
                           result)
                          (when tokens
                            (format
                             tokens "~a~c~{~a~^ ~}~%"
                             micro #\tab (and pos (get-field :tags token)))))))
        (when tokens (terpri tokens))
        (setf result (nreverse result))
        (cond
         (stream
          (format
           stream "[~a] |~{~a~^ ~}~%"
           (get-field :i-id item) result)))
        (return result)))

#+:null
(let ((items (analyze "gold/deepbank/wsj00a"
                      :condition "t-active == 1"
                      :thorough '(:derivation) :inputp t)))
  (with-open-file (stream "/tmp/wsj00a.none.yy"
                   :direction :output :if-exists :supersede)
    (loop
        for item in items
        do (dt-convert item :pos t :percent 0 :stream stream)))
  (with-open-file (stream "/tmp/wsj00a.10.yy"
                   :direction :output :if-exists :supersede)
    (loop
        for item in items
        do (dt-convert item :pos t :percent 10 :stream stream)))
  (with-open-file (stream "/tmp/wsj00a.25.yy"
                   :direction :output :if-exists :supersede)
    (loop
        for item in items
        do (dt-convert item :pos t :percent 25 :stream stream)))
  (with-open-file (stream "/tmp/wsj00a.half.yy"
                   :direction :output :if-exists :supersede)
    (loop
        for item in items
        do (dt-convert item :pos t :percent 50 :stream stream)))
  (with-open-file (stream "/tmp/wsj00a.all.yy"
                   :direction :output :if-exists :supersede)
    (loop
        for item in items
        do (dt-convert item :pos t :percent 100 :stream stream))))