;;; Hey, emacs(1), this is -*- Mode: Common-Lisp; -*-, got it? ;;; ;;; PAL --- PCFG Approximation and Parsing Library for DELPH-IN ;;; ;;; Copyright (c) 2009 -- 2012 Johan Benum Evensberget (johan.benum@gmail.com) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (in-package :pcfg) (defun add-tt-mapping (symbol top-type) (setf (gethash symbol *TT-table*) top-type)) ;;; In order to create CFGs containing more information from ;;; the TFSes of the actual edges, we create a layer between ;;; our approximated edges and the real edges (defstruct approx-edge real-edge label leaves children) (defmethod print-object ((object approx-edge) stream) (format stream "#[aprx-edge: Label: ~a Edge ~a]" (approx-edge-label object) (approx-edge-real-edge object))) ;;; Edge approximation (defun skip-edge (edge) (let ((name (symbol-name (tsdb::edge-root edge *pcfg-use-preterminal-types-p*)))) (or (string= name "W_ITALICS_DLR") (string= (subseq name (- (length name) 4)) "_PLR")))) (defun create-transitive-inclusion (edge) (loop while (skip-edge edge) do (setf edge (first (lkb::edge-children edge)))) (let (path terminal-path name terminal) (loop with current-edge = edge for id = (tsdb::edge-root current-edge *pcfg-use-preterminal-types-p*) while (lkb::edge-children current-edge) do (push current-edge path) (push id terminal-path) (setf current-edge (first (lkb::edge-children current-edge))) finally (push id terminal-path) (setf terminal (first (lkb::edge-leaves current-edge))) (push current-edge path) (setf terminal-path (nreverse terminal-path)) (setf path (nreverse path))) (setf name (intern (format nil "~{~a~^/~}" terminal-path) :pcfg)) (make-approx-edge :label (string name) :children nil :real-edge (first path) :leaves (list (string terminal))))) (defun create-edge-label (edge &key root) (unless root (setf root (tsdb::edge-root edge *pcfg-use-preterminal-types-p*))) (when (lkb::edge-dag edge) (let* ((dag (lkb::edge-dag edge)) (tdfs (lkb::tdfs-indef dag)) (paths (mapcar #'cdr *include-TDF-label-paths*)) (lables (mapcar #'car *include-TDF-label-paths*)) (values (mapcar (lambda (path label) (list label (lkb::get-value-at-end-of tdfs path))) paths lables)) (lists (loop for (label listpath valuepath) in *include-TDF-lists* for list = (get-dag-list-at tdfs listpath) for values = (mapcar (lambda (x) (lkb::get-value-at-end-of x valuepath)) list) when values collect (list label values)))) (intern (format nil "~a::~{~{[~a]~a~}~^:~}:~{~{[~a]{~{~a~^,~}}~^:~}~}" root values lists) :lkb)))) (defun get-dag-list-at (dag path) (let ((list-head (loop with res = dag for attrib in path while res do (setf res (lkb::get-dag-value res attrib)) finally (return res)))) (if (or (null list-head) (null (lkb::get-dag-value list-head 'lkb::first))) nil (cons (lkb::get-dag-value list-head 'lkb::first) (get-dag-list-at list-head '(lkb::rest)))))) (defun dag-list-to-heads (dag-list) (mapcar (lambda (x) (lkb::get-value-at-end-of x '(lkb::local lkb::cat lkb::head))) dag-list)) #+:null (defparameter *transitive-inclusion-predicate* (lambda (&rest rest) nil)) ;; Controls when a label-collapsing will occur. (defun lexical-item-p (edge) (or (stringp (tsdb::edge-root edge)) (lkb::get-lex-rule-entry (tsdb::edge-root edge)))) (defparameter *transitive-inclusion-predicate* #'lexical-item-p) (defparameter *collapser* #'create-transitive-inclusion) (defparameter *edge-labeler* #'create-edge-label) (defun prepare-edge (edge) ;;; Go through the derivation top-down (cond ((and (eq *pcfg-lexical-mode* :collapse) (lexical-item-p edge)) (create-transitive-inclusion edge)) (t (let* ((ignorep (and (eq *pcfg-lexical-mode* :ignore) (lexical-item-p edge))) (children (unless ignorep (mapcar #'prepare-edge (lkb::edge-children edge)))) (root (when ignorep (loop with edge = edge while (lkb::edge-children edge) do (setf edge (first (lkb::edge-children edge))) finally (return (tsdb::edge-root edge *pcfg-use-preterminal-types-p*)))))) (make-approx-edge :real-edge edge :label (funcall *edge-labeler* edge :root root) :children children :leaves (if children (loop for x in children append (approx-edge-leaves x)) (lkb::edge-leaves edge))))))) ;;; Now we can create the actual CFGs from a list of items. (defun estimate-cfg (items &key (stream tsdb::*tsdb-io*) cfg-table chatter) ;; ;; _fix_me_ ;; need to parameterize as to which result(s) to use in estimation, e.g. ;; when running off non-treebanked WikiWoods, or when simulating lattice- ;; based input (by training on the full lexical lattice of the test data). ;; (declare (special tsdb::*reconstruct-cache*)) (loop with *package* = (find-package :lkb) with lkb::*edge-registry* = nil with cfg = (make-cfg) with length = (length items) initially (when cfg-table (setf (cfg-table cfg) cfg-table)) for c-count from 1 for r-count from 1 for item in items for id = (get-field :i-id item) for ranks = (get-field :ranks item) for edges = (loop with tsdb::*reconstruct-cache* = nil for rank in ranks for i = (get-field :rank rank) for derivation = (get-field :derivation rank) for edge = (when (and i (= i 1)) (tsdb::reconstruct derivation t :cachep nil)) when edge do (setf (lkb::edge-baz edge) derivation) and collect edge) when (and chatter (= 0 (mod c-count 250))) do (format stream "~&[~a] estimate-cfg: processing item ~a out of ~a~%" (tsdb::current-time :long :short) c-count length) when edges do (loop for edge in edges do (edge-to-cfrs (prepare-edge edge) cfg)) (incf (cfg-samples cfg)) else do (format stream "~&[~a] estimate-cfg(): ignoring item # ~d (no edge);~%" (tsdb::current-time :long :short) id) finally (estimate-probabilities cfg) (return cfg))) (defun edge-to-cfrs (edge cfg) (let ((rule (edge-to-cfr edge cfg)) (sponsor (when (consp (lkb::edge-baz (approx-edge-real-edge edge))) (tsdb::derivation-sponsor (lkb::edge-baz (approx-edge-real-edge edge))))) (root (approx-edge-label edge))) (when (and sponsor root) (let* ((table (cfg-table cfg)) (lhs (symbol-to-code 'root table)) (rhs (list (symbol-to-code root table)))) (add-tt-mapping lhs 'root) (add-tt-mapping (car rhs) (tsdb::edge-root (approx-edge-real-edge edge))) (record-cfr (make-cfr :type :root :lhs lhs :rhs rhs) cfg))) (record-cfr rule cfg) (unless (tsdb::smember (cfr-type rule) '(:root :irule :word)) (loop for daughter in (approx-edge-children edge) do (edge-to-cfrs daughter cfg))))) (defun edge-to-cfr (edge cfg) ;; ;; to support CFG extraction from packed forests, we use the temporary edge ;; slot .foo. to (destructively) associate CFG rules with edges. hence, in ;; case an edge is part of more than one derivation, its CFG rule is stored ;; within the edge, and since the rule includes a pointer to `its' grammar, ;; we can even tell whether we return to a stored rule in the same context. ;; (let* ((cfr (lkb::edge-foo (approx-edge-real-edge edge))) (table (cfg-table cfg)) (root (approx-edge-label edge)) (daughters (approx-edge-children edge))) (cond ((and (cfr-p cfr) (eq (cfr-cfg cfr) cfg)) cfr) ((and root (null daughters) *pcfg-include-leafs-p*) (let* ((lhs (symbol-to-code root table)) (rhs (symbol-to-code (first (approx-edge-leaves edge)) table))) (if (funcall *transitive-inclusion-predicate* (approx-edge-real-edge edge)) (add-tt-mapping lhs lhs) (add-tt-mapping lhs (tsdb::edge-root (approx-edge-real-edge edge)))) (add-tt-mapping rhs rhs) (setf (lkb::edge-foo (approx-edge-real-edge edge)) (make-cfr :type :word :lhs lhs :rhs rhs)))) (root (let* ((lhs (symbol-to-code root table)) (rhs (loop for edge in daughters for root = (approx-edge-label edge) collect (symbol-to-code root table)))) (add-tt-mapping lhs (tsdb::edge-root (approx-edge-real-edge edge))) (setf (lkb::edge-foo (approx-edge-real-edge edge)) (make-cfr :type :rule :lhs lhs :rhs rhs))))))) ;;; Further support functions: (defun get-cfg-rule-table (cfg) (with-slots (rule-table) cfg (unless rule-table (let ((table (make-hash-table :test #'equal))) (maphash (lambda (k v) (declare (ignore k)) (mapc (lambda (x) (setf (gethash (cfr-hash x) table) x)) v)) (cfg-rules cfg)) (setf rule-table table))) rule-table)) ;;; these two are now deprecated! (defun cfg-rules-to-id-set (cfg) (let ((result nil)) (maphash (lambda (k v) (declare (ignore k)) (mapc (lambda (x) (push (list (cfr-hash x) x) result)) v)) (cfg-rules cfg)) result)) (defun make-cfg-shallow-rule-table (cfg) (let ((table (make-hash-table :test #'equal))) (mapcar (lambda (x) (setf (gethash (car x) table) (cadr x))) (cfg-rules-to-id-set cfg)) table)) (defun cfg-unseen-rules (cfg1 cfg2 &key (count-only-type nil)) "List the rules in cfg2 not in cfg1" (let ((shallow-cfg1-rule-table (get-cfg-rule-table cfg1)) (shallow-cfg2-rule-table (get-cfg-rule-table cfg2)) (result nil)) (maphash (lambda (k v) (when (or (null count-only-type) (eq (cfr-type v) count-only-type)) (unless (gethash (remap-cfr-hash k (cfg-table cfg2) (cfg-table cfg1)) shallow-cfg1-rule-table) (push k result)))) shallow-cfg2-rule-table) result)) (defun filter-terminals (rules) (mapcan (lambda (x) (unless (stringp (cadr x)) (list x))) rules)) (defun table-key-union (table1 table2) (let ((result (make-hash-table :test #'equal))) (maphash (lambda (k v) (declare (ignore v)) (setf (gethash k result) k)) table1) (maphash (lambda (k v) (declare (ignore v)) (setf (gethash k result) k)) table2) result)) (defun add-cfgs (cfg1 cfg2) "Creates a new cfg with the rules from cfg1 and cfg2" (let* ((shallow-cfg1-rule-table (make-cfg-shallow-rule-table cfg1)) (shallow-cfg2-rule-table (make-cfg-shallow-rule-table cfg2)) (key-union (table-key-union shallow-cfg1-rule-table shallow-cfg2-rule-table)) (result (make-cfg :table (cfg-table cfg2)))) (loop for rule being each hash-key in key-union for cfg1-entry = (gethash rule shallow-cfg1-rule-table) for cfg2-entry = (gethash rule shallow-cfg2-rule-table) for new-rule = (make-cfr) for count = (+ (if cfg1-entry (cfr-count cfg1-entry) 0) (if cfg2-entry (cfr-count cfg2-entry) 0)) when (and cfg1-entry cfg2-entry (not (eq (cfr-type cfg1-entry) (cfr-type cfg2-entry)))) do (format t "add-cfgs(): WARN: ~a , ~a does not have the same type~%" cfg1-entry cfg2-entry) ;; we should probably include both here then? do (setf (cfr-count new-rule) count (cfr-type new-rule) (cfr-type (or cfg1-entry cfg2-entry)) (cfr-lhs new-rule) (cfr-lhs (or cfg1-entry cfg2-entry)) (cfr-rhs new-rule) (cfr-rhs (or cfg1-entry cfg2-entry))) (record-cfr new-rule result)) (setf (cfg-samples result) (+ (cfg-samples cfg1) (cfg-samples cfg2))) result)) (defun remap-code (code from-table to-table) (symbol-to-code (code-to-symbol code from-table) to-table)) (defun remap-cfr-hash (cfr-hash from-table to-table) (if (eq from-table to-table) cfr-hash (maptree (lambda (x) (remap-code x from-table to-table)) cfr-hash))) ;;; TODO destructively modifies symbol table (defun merge-cfgs (cfgs &key (merge-only nil)) (declare (ignore merge-only)) "Merges a list of cfgs, destructively modifies symbol table in first elem" (loop ;; prepare the first cfg, add-cfgs w/ new cfg clones the first cfg with result = (add-cfgs (make-cfg) (car cfgs)) with master-table = (get-cfg-rule-table result) for cfg in (cdr cfgs) do (loop with rule-table = (get-cfg-rule-table cfg) for rule-id being each hash-key in (get-cfg-rule-table cfg) for remapped-id = (remap-cfr-hash rule-id (cfg-table cfg) (cfg-table result)) for new-entry = (gethash rule-id rule-table) for old-entry = (gethash remapped-id master-table) for count = (+ (cfr-count new-entry) (if old-entry (cfr-count old-entry) 0)) when (and new-entry old-entry (not (eq (cfr-type old-entry) (cfr-type new-entry)))) do (format t "merge-cfgs(): WARN: ~a , ~a does not have the same type~%" new-entry old-entry) ;; now update the count, or create and record new rule: when old-entry do (setf (cfr-count old-entry) count) (incf (gethash (cfr-lhs old-entry) (cfg-counts result))) else do (record-cfr (destructuring-bind (lhs rhs) remapped-id (make-cfr :type (cfr-type new-entry) :count count :lhs lhs :rhs rhs)) result)) (incf (cfg-samples result) (cfg-samples cfg)) finally (return result))) (defun merge-tt-tables (tables grammars) "Merges a list of tt tables, it is assumed that the first grammar has the merged symbol-table of the grammars" (let ((result-table (make-hash-table :test #'equal))) (loop with master-symbol-table = (cfg-table (Car grammars)) for table in tables for grammar in grammars for symbol-table = (cfg-table grammar) do (loop for mapping-code being the hash-keys in table for top-type = (gethash mapping-code table) for internal-symbol = (code-to-symbol mapping-code symbol-table) for merged-code = (symbol-to-code internal-symbol master-symbol-table) if (numberp top-type) do (setf (gethash merged-code result-table) merged-code) else do (setf (gethash merged-code result-table) top-type)) finally (return result-table)))) (defun collect-rules (cfg) (let (result) (loop for ruleset being each hash-value in (cfg-rules cfg) do (loop for rule in ruleset do (push rule result))) result)) (defun split-rules (cfg) (let (rules words (n-rules 0) (n-words 0)) (loop for ruleset being each hash-value in (cfg-rules cfg) do (loop for rule in ruleset for type = (cfr-type rule) when (or (eq :RULE type) (eq :ROOT type)) do (push rule rules) (incf n-rules) when (or (eq :WORD type)) do (push rule words) (incf n-words))) (pairlis '(:rules :words :n-rules :n-words) (list rules words n-rules n-words)))) ;;; fixme do not add known words (defun merge-words (cfg-1 cfg-2 &key (true-distribution nil)) "Merge all the words from cfg-2 into a clone of cfg-1, cfg-1 and cfg-2 should share symbol-table" (let ((split (split-rules cfg-2)) (clone (add-cfgs (make-cfg) cfg-1))) (loop for word in (get-field :words split) do (with-slots (lhs rhs type count) word (when (gethash lhs (cfg-rules cfg-1)) ;; do not merge in unusable rules or bitpar chokes (record-cfr (make-cfr :type type :rhs rhs :lhs lhs :count (if true-distribution count 1)) clone))) finally (return clone)))) (defun make-start-symbol (rules) (loop with start-symbol = "start" with seen = (make-hash-table :test #'equal) for x in rules when (and (eq (cfr-type x) :ROOT)) do (let ((count (if (gethash (cfr-lhs x) seen) (cfr-count (gethash (cfr-lhs x) seen)) 0))) (setf (gethash (cfr-lhs x) seen) (make-cfr :lhs start-symbol :rhs (list (cfr-lhs x)) :count (+ count (cfr-count x))))) finally (return (let (res) (maphash (lambda (k v) (declare (ignore k)) (push v res)) seen) res)))) (defun make-word-counts (rules) (loop with table = (make-hash-table :test #'equal) for rule in rules when (eq (cfr-type rule) :WORD) do (let ((word (gethash (cfr-rhs rule) table))) (if (null word) (setf (gethash (cfr-rhs rule) table) (list (list (cfr-lhs rule) (cfr-count rule)))) (push (list (cfr-lhs rule) (cfr-count rule)) (gethash (cfr-rhs rule) table)))) finally (return table))) (defun report-undergeneration-unseen-rules (master-cfg cfgs &key (stream t)) (let* ((master-cfg-table (make-cfg-shallow-rule-table master-cfg)) (card (cfg-count master-cfg))) (loop for cfg in cfgs for part = (/ (cfg-samples cfg) (cfg-samples master-cfg)) for cfg-table = (make-cfg-shallow-rule-table cfg) for unseen = 0 for unseen-rule = 0 do (loop for rule being each hash-key in master-cfg-table do (when (null (gethash rule cfg-table)) (incf unseen)) (when (and (null (gethash rule cfg-table)) (not (stringp (cadr rule)))) (incf unseen-rule))) (format stream "~a ~a ~a ~%" (float part) (float (/ (- card unseen) card)) (float (/ (- card unseen-rule) card)))))) (defun check-useless-productions (cfg) (let ((rules (collect-rules cfg))) (let ((lhs-table (make-hash-table :test #'equal)) (rhs-table (make-hash-table :test #'equal))) ;; Maintain two tables, each lhs that is not root should be seen in RHS of a rule. ;; Each RHS should be an LHS of a rule except for words: (loop for rule in rules for lhs = (cfr-lhs rule) for rhs = (cfr-rhs rule) when (or (null lhs) (null rhs)) do ;; LHS/RHS should never be nil: (format t "CFG ~a is unsound, rule ~a has illegal lhs or rhs: ~a -> ~a~%" cfg rule lhs rhs) (return) do (unless (eq (cfr-type rule) :ROOT) (setf (gethash lhs lhs-table) t)) (unless (eq (cfr-type rule) :WORD) (mapcar (lambda (x) (setf (gethash x rhs-table) t)) rhs))) (maphash (lambda (k v) (declare (ignore v)) (unless (gethash k rhs-table) (format t "CFG ~a is unsound, useless LHS label: ~a~%" cfg k))) lhs-table) (maphash (lambda (k v) (declare (ignore v)) (unless (gethash k lhs-table) (format t "CFG ~a is unsound, useless RHS label: ~a~%" cfg k))) rhs-table)))) (defun report-grammar-types (grammar) (loop with words = 0 with roots = 0 with rules = 0 for rule in (collect-rules grammar) when (eq (cfr-type rule) :word) do (incf words) when (eq (cfr-type rule) :rule) do (incf rules) when (eq (cfr-type rule) :root) do (incf roots) finally (return (pairlis '(:word-count :rule-count :root-count :label-count) (list words rules roots (hash-table-count (cfg-rules grammar))))))) (defun approx-edge-to-tree (approx-edge) (if (null (approx-edge-children approx-edge)) (list (approx-edge-label approx-edge) (car (approx-edge-leaves approx-edge))) (cons (approx-edge-label approx-edge) (mapcar #'approx-edge-to-tree (approx-edge-children approx-edge))))) (defun tree-to-rules (tree) (cond ((null tree) nil) ((atom tree) tree) ((atom (cadr tree)) (list tree)) (t (cons (list (car tree) (mapcar #'car (cdr tree))) (loop for x in (cdr tree) append (tree-to-rules x)))))) (defun tree-minus-preterminal (tree) (if (and (= (length tree) 2) (atom (second tree))) nil (cons (car tree) (loop for daughter in (rest tree) when (tree-minus-preterminal daughter) collect it)))) (defun pret-yield (tree) (let (accu) (labels ((walk (tree) (if (and (= (length tree) 2) (atom (second tree))) (push (car tree) accu) (mapcar #'walk (rest tree))))) (walk tree) accu))) (defun perkele (gold-path blue-path tt-path symbol-path) (let* ((*tt-table* (read-hash-table tt-path)) (*pcfg-symbol-table* (read-symbol-table symbol-path)) (gold (mapcar #'normalize-tree (read-and-tt-code-trees gold-path))) (blue (mapcar #'normalize-tree (read-and-tt-code-trees blue-path))) (gold* (tree-minus-preterminal gold)) (blue* (tree-minus-preterminal blue)) (gold-tags (mapcar #'pret-yield gold)) (blue-tags (mapcar #'pret-yield blue)) (covered (loop for x in blue count x)) (evalb-ex (/ (loop for blue in blue* for gold in gold* when (and blue gold) count (equal blue gold)) covered)) (total-ex (/ (loop for blue in blue for gold in gold when (and blue gold) count (equal blue gold)) covered))) (destructuring-bind (prec rec parseval-ex) (parseval-trees gold blue) (pairlis '(:evalb-ex :total-ex :prec :rec :parseval-ex :f1 :tagacc) (mapcar #'float (list evalb-ex total-ex prec rec (/ parseval-ex (length blue)) (/ (* 2 prec rec) (+ prec rec)) (loop with total = 0 with correct = 0 for blues in blue-tags for golds in gold-tags when (and blues golds) do (loop for blue in blues for gold in golds when (eq gold blue) do (incf correct) do (incf total)) finally (return (/ correct total))))))))) (defun merge-all (path) (let ((grammars (excl.osi:command-output (concatenate 'string "ls " path)))) (loop with cfgs with symbol-tables with tt-tables for grammar in grammars for lpath = (concatenate 'string path "/" grammar "/") for symbol-table = (read-symbol-table (concatenate 'string lpath "symbol-table")) for tt-table = (read-hash-table (concatenate 'string lpath "tt-table")) for cfg = (read-grammar (concatenate 'string lpath "grammar") symbol-table) do (push tt-table tt-tables) (push symbol-table symbol-tables) (push cfg cfgs) finally (return (pairlis '(:grammar :symbol-table :tt-table) (list (merge-cfgs cfgs) (cfg-table (first cfgs)) (merge-tt-tables tt-tables cfgs)))))))