;;; Hey, emacs(1), this is -*- Mode: Common-Lisp; -*-, got it? ;;; ;;; PAL --- PCFG Approximation and Parsing Library for DELPH-IN ;;; ;;; Copyright (c) 2009 -- 2012 Johan Benum Evensberget (johan.benum@gmail.com) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (in-package :pcfg) (defun item-first-derivation (item) (declare (inline)) (get-field :derivation (car (get-field :results item)))) (defun print-bitpar-lexicon (word-counts stream) (maphash (lambda (k v) (format stream "~a~c~{~{~a ~a~}~^ ~}~%" k #\tab v)) word-counts)) (defun print-bitpar-rules (rules stream) (let ((cfg (make-cfg :table *pcfg-symbol-table*))) (mapcar (lambda (x) (unless (eq (cfr-type x) :WORD) (print-cfr x cfg :format :bitpar :stream stream))) rules))) (defun export-bitpar-grammar (grammar name path) (let* ((rules (collect-rules grammar)) (start (make-start-symbol rules)) (word-counts (make-word-counts rules))) (with-open-file (stream (concatenate 'string path "/" name ".bprl") :direction :output :if-exists :supersede) (print-bitpar-rules (append start rules) stream)) (with-open-file (stream (concatenate 'string path "/" name ".bplx") :direction :output :if-exists :supersede) (print-bitpar-lexicon word-counts stream)))) (defun reconstruct-bitpar-tree (tree) (mapcar (lambda (x) (if (atom x) (if (numberp x) (code-to-symbol x *pcfg-symbol-table*) x) (reconstruct-bitpar-tree x))) tree)) (defun construct-le-type-lexicon (grammar) (let* ((result (make-hash-table :test #'equal)) (words (remove-if-not (lambda (x) (eq (cfr-type x) :WORD)) (collect-rules grammar)))) (loop for word in words do (incf (gethash (cfr-lhs word) result 0) (cfr-count word))) result)) (defun create-bitpar-gold-file (items file grammar &key (symbol-table *pcfg-symbol-table*) (covered-items-list nil)) (declare (special tsdb::*reconstruct-cache*)) (let ((*print-pretty* nil)) (let () (with-open-file (stream file :direction :output :if-exists :supersede) (loop for tsdb::*reconstruct-cache* = nil with lkb::*edge-registry* = nil with rule-table = (get-cfg-rule-table grammar) with unseen-rules = 0 with unrecoverable-parses = 0 with *package* = (find-package :lkb) for id from 0 for coverage = (if covered-items-list (member id covered-items-list) t) for res in items for item = (item-first-derivation res) for rec = (tsdb::reconstruct item) for root = (and rec (prepare-edge rec)) for tree = (and rec (approx-edge-to-tree root)) for coded-tree = (maptree (lambda (x) (or (symbol-to-code x symbol-table :rop t) x)) tree) for rules = (tree-to-rules coded-tree) for rooted-tree = (cons "start" (list (cons (symbol-to-code 'root symbol-table :rop t) (list coded-tree)))) do (loop for rule in rules with complete-coverage = t unless (gethash rule rule-table) do (incf unseen-rules) (setf complete-coverage nil) finally (unless complete-coverage (incf unrecoverable-parses))) when (and rec coverage) do (format stream "~a~%" rooted-tree) finally (return (pairlis '(:unseen-rules :unrecoverable-trees) (list unseen-rules unrecoverable-parses)))))))) ;;; TODO, we need some sort of setting here to be able to reconstruct ;;; everything properly (defun create-bitpar-parse-file (items file grammar &key (coverage-list nil)) (with-open-file (stream file :direction :output :if-exists :supersede) (loop with *package* = (find-package :lkb) for tsdb::*reconstruct-cache* = nil with lkb::*edge-registry* = nil with unseen-word-map = (make-hash-table :test #'equal) with lexicon = (make-word-counts (collect-rules grammar)) with unseen-words = 0 with unseen-sentences = 0 with lexically-covered-items = nil for id from 0 for unseen = nil for bail = nil for item in items for rec = (tsdb::reconstruct (item-first-derivation item)) for prep = (and rec (prepare-edge rec)) for leaves = (and rec (mapcar (lambda (x) (or (symbol-to-code x (cfg-table grammar) :rop t) x)) (approx-edge-leaves prep))) unless rec do (setf bail t) when coverage-list do (unless bail (setf bail (not (member id coverage-list)))) do (loop for word in leaves if (null (gethash word lexicon)) do (setf unseen t (gethash word unseen-word-map) t) (incf unseen-words)) if (not (or bail unseen )) do (format stream "~{~a~^~%~}~%~%" leaves) (push id lexically-covered-items) else do (when unseen (incf unseen-sentences)) finally (return (pairlis '(:lexically-covered-items :unseen-words :unseen-sentences :unseen-word-map) (list (nreverse lexically-covered-items) unseen-words unseen-sentences unseen-word-map)))))) (defun create-test-bitpar-files (test-treebanks names path grammar &key (generation-list nil)) (loop for test-treebank in test-treebanks for name in names for i from 0 for list = (nth i generation-list) for items = (tsdb::analyze test-treebank :condition "readings > 0 && t-active == 1" :thorough '(:derivation) :gold test-treebank) collect (acons :name name (create-bitpar-parse-file items (format nil "~a/~a.items" path name ) grammar :coverage-list list)))) (defun create-gold-bitpar-files (test-treebanks names path grammar &key (symbol-table *pcfg-symbol-table*) (coverage-lists nil)) (loop for test-treebank in test-treebanks for name in names for id from 0 for coverage-list = (and coverage-lists (nth id coverage-lists)) for items = (tsdb::analyze test-treebank :condition "readings > 0 && t-active == 1" :thorough '(:derivation) :gold test-treebank) collect (acons :name name (create-bitpar-gold-file items (format nil "~a/~a.gold" path name) grammar :symbol-table symbol-table :covered-items-list coverage-list))))