;;; Hey, emacs(1), this is -*- Mode: Common-Lisp; -*-, got it? ;;; ;;; PAL --- PCFG Approximation and Parsing Library for DELPH-IN ;;; ;;; Copyright (c) 2009 -- 2012 Johan Benum Evensberget (johan.benum@gmail.com) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (in-package :pcfg) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; a simple context free grammar, represented as a structure (separating out ;;; lexical vs. syntactic rewrite rules), and leaving the set of terminal ;;; symbols implicit (i.e. all left-hand sides in rewrite rules). ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defstruct grammar (roots (make-hash-table :test #'equal)) (rules (make-hash-table :test #'eql)) ;; rhs (ptab (make-hash-table :test #'equal)) ;; list lhs rhs (ltab (make-hash-table :test #'eql)) tsdb-grammar-p (index (make-hash-table)) ;; first rhs (lexicon (make-hash-table :test #'equal))) (defmethod print-object ((object grammar) stream) (format stream "#" (hash-table-count (grammar-lexicon object)) (hash-table-count (grammar-index object)) (hash-table-count (grammar-rules object)))) (defstruct rule lhs rhs (log-prob 0.0 :type single-float) count) (defstruct lexeme category (log-prob 0.0 :type single-float) count) (defparameter *lhs-table* (make-hash-table)) (defparameter *bin-lhs-table* (make-hash-table)) (defmacro rule-encode (x y) `(the fixnum (logior (the fixnum (* 1361 (the fixnum ,x))) (the fixnum (ash (the fixnum (* 1363 (the fixnum ,y))) 32))))) (defvar *grammar*) (defun read-serialized-grammar (path) (clrhash *lhs-table*) (clrhash *bin-lhs-table*) (let ((new-grammar (make-grammar)) tasks) (with-open-file (stream path :direction :input) (read stream nil :eof) (loop for rule = (read stream nil :eof) until (eq rule :eof) do (destructuring-bind ((lhs rhs) type count) rule (when (eql type :rule) (let ((rule (make-rule :lhs lhs :rhs rhs :count count))) (push rule (gethash (first rhs) (grammar-index new-grammar))) (push rule tasks) (when (= 2 (length rhs)) (incf (gethash lhs *bin-lhs-table* 0))) (incf (gethash lhs *lhs-table* 0) count))) (when (eq type :root) (let ((rule (make-rule :lhs lhs :rhs rhs :count count))) (push rule (gethash (first rhs) (grammar-index new-grammar))) (push rule tasks) (incf (gethash lhs *lhs-table* 0) count) (setf (gethash lhs (grammar-roots new-grammar)) t))) (when (eq type :word) (let ((word (make-lexeme :category lhs :count count))) (setf (gethash (list lhs rhs) (grammar-ptab new-grammar)) word) (push word (gethash rhs (grammar-lexicon new-grammar))) (incf (gethash lhs *lhs-table* 0) count) (push word tasks)))))) (loop for item in tasks do (if (lexeme-p item) (setf (lexeme-log-prob item) (log (/ (lexeme-count item) (gethash (lexeme-category item) *lhs-table*)))) (progn (setf (gethash (list (rule-lhs item) (rule-rhs item)) (grammar-ptab new-grammar)) item) (push item (gethash (rule-lhs item) (grammar-ltab new-grammar))) (setf (rule-log-prob item) (log (/ (rule-count item) (gethash (rule-lhs item) *lhs-table*)))) (push item (gethash (rule-encode (first (rule-rhs item)) (or (second (rule-rhs item)) -1)) (grammar-rules new-grammar)))))) (setf *grammar* new-grammar))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; the edge structure: entries in our chart, using a specialized print method ;;; to make .edge. objects print in the format assumed during our lectures. ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (let ((n 0)) (declare (type fixnum n)) (defun reset-edge-id () (setf n 0)) (defun new-edge-id () (let ((id n)) (incf n) id))) (declaim (ftype (function new-edge-id () fixnum))) (defstruct edge (to -1 :type fixnum) (from -1 :type fixnum) (id (new-edge-id)) (category -1 :type fixnum) unpacking sponsor surface (children nil :type (cons edge t)) (alternates nil) trees) (defmethod print-object ((object edge) stream) ;; ;; arrange for objects of type `edge' to print nice, using the compact format ;; discussed on slide #11 of 2-nov-10. ;; (format stream "#[E ~a: (~a-~a) ~a -->" (edge-id object) (edge-from object) (edge-to object) (edge-category object)) (when (edge-children object) (loop for daughter in (edge-children object) do (format stream " ~a" (edge-id daughter)))) (format stream " .") (when (edge-alternates object) (format stream " {") (loop for edge in (edge-alternates object) do (format stream " ~a" (edge-id edge))) (format stream " }")) (format stream "]")) (defparameter *cky-chart* nil) (declaim (type (simple-array t (* *)) *cky-chart*)) (defun cky-initialize (n) (setf *cky-chart* (make-array (list (1+ n) (1+ n)) :initial-element nil))) (defmacro push-edge (edge from to) `(push ,edge (aref *cky-chart* ,from ,to))) (defmacro chart-edges (from to) `(aref *cky-chart* ,from ,to)) ;;; FIXME this makes selectively-unpack edges confused :( (defun cky-augment-unary-rules (edges) (loop with agenda = edges with edge for next = (pop agenda) while next if (listp next) do (setf edge (make-edge :from (edge-from (first next)) :to (edge-to (first next)) :sponsor (second next) :category (rule-lhs (second next)) :children (list (first next)))) else do (setf edge next) unless (pack-edge edge :hosts (chart-edges (edge-from edge) (edge-to edge))) do (push-edge edge (edge-from edge) (edge-to edge)) (loop for rule in (gethash (rule-encode (edge-category edge) -1) (grammar-rules *grammar*)) do (let ((new (list edge rule))) (push new agenda))))) (defun cky-parse (input) (let ((n (if (every #'consp input) (loop for item in input maximize (getf item :to)) (length input)))) (reset-edge-id) (cky-initialize n) ;;; push all lexical items (if (every #'consp input) (loop with bucket for item in input for edge = (apply #'make-edge item) do (push edge bucket) finally (cky-augment-unary-rules bucket)) (loop with bucket = nil for i fixnum from 0 for word in input do (loop for lexeme of-type lexeme in (gethash word (grammar-lexicon *grammar*)) for edge = (make-edge :from i :to (the fixnum (+ i 1)) :category (lexeme-category lexeme) :sponsor lexeme :surface word) do (push edge bucket)) finally (cky-augment-unary-rules bucket))) (loop for l of-type (integer 0 1024) from 1 to (the fixnum (1- n)) do (loop for i of-type (integer 0 1024) from 0 to (the fixnum (- n (1+ l))) do (loop for j of-type (integer 0 1024) from 1 to l for left-cell = (chart-edges i (the fixnum (+ i j))) for right-cell = (chart-edges (the fixnum (+ i j)) (the fixnum (+ i l 1))) for bucket = nil do (loop for item in left-cell when (not (null (gethash (edge-category item) (grammar-index *grammar*)))) do (loop for member in right-cell for code = (rule-encode (edge-category item) (edge-category member)) for rules = (gethash code (grammar-rules *grammar*)) do (loop for rule in rules for new = (make-edge :from i :to (the fixnum (+ i l 1)) :sponsor rule :category (rule-lhs rule) :children (list item member)) do (push new bucket)))) (cky-augment-unary-rules (nreverse bucket))))) (loop for edge in (chart-edges 0 n) when (and (= (edge-to edge) n) (gethash (edge-category edge) (grammar-roots *grammar*))) collect edge))) (defun pack-edge (edge &key (hosts :emp)) (if (null hosts) nil (progn (when (eq hosts :emp) (setf hosts nil)) (labels ((daughterp (host edge) ;; ;; a local function to test whether .host. is (transitively) ;; embedded as a daughter somewhere below .edge. ;; (let ((daughters (edge-children edge))) (unless (or (null daughters) (rest daughters)) (let ((daughter (first daughters))) (or (eq daughter host) (daughterp host daughter))))))) (loop with category = (edge-category edge) with to = (edge-to edge) for host in hosts when (and (eql category (edge-category host)) (= to (edge-to host)) (= (edge-from edge) (edge-from host))) do (unless (or (daughterp host edge)) (push edge (edge-alternates host)))))))) (defun edge-equal (x y) (or (eq x y) (and (equal (edge-category x) (edge-category y)) (equal (edge-children x) (edge-children y))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; the parser computes a parse forest, but in the end we want to recover the ;;; full parse trees (aka derivations), i.e. multiply out packed amgiguities. ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun cross-product (sets) ;; ;; given a set of sets, compute all the tuples (represented as lists) that ;; contain exactly one element from each of the input sets, e.g. ;; ;; (cross-product '((1 2 3) (A B) (X Y))) ;; --> ((1 A X) (1 A Y) (1 B X) (1 B Y) (2 A X) (2 A Y) ;; (2 B X) (2 B Y) (3 A X) (3 A Y) (3 B X) (3 B Y)) ;; (if (null (rest sets)) (loop for foo in (first sets) collect (list foo)) (loop for bar in (cross-product (rest sets)) nconc (loop for foo in (first sets) collect (cons foo bar))))) (defun unpack-edge (edge) ;; ;; unpack-edge() computes the list of trees corresponding to .edge. once all ;; packed ambiguity inside of it has been multiplied out. unpacking is sort ;; of interesting: for `leaf' edges (those that have no children) we simply ;; return the category of .edge. (i.e. an atomic tree). for all other edges, ;; combine the result of unpacking the children of .edge. itself with that ;; of unpacking all its alternates. each daughter could unpack into multiple ;; trees, so that with two children, say, both unpacking into two trees, we ;; have to construct four trees: combining the four possible combinations of ;; children with the root node (the category) of .edge. itself. ;; (or (edge-trees edge) (setf (edge-trees edge) (if (edge-children edge) (nconc (loop for children in (cross-product (loop for daughter in (edge-children edge) collect (unpack-edge daughter))) collect (cons (edge-category edge) children)) (loop for alternate in (edge-alternates edge) append (unpack-edge alternate))) (list (edge-category edge)))))) (defun tree-prob (tree) (let ((lhs (first tree)) (rhs (if (consp (second tree)) (mapcar #'car (rest tree)) (second tree)))) (if (atom rhs) (lexeme-log-prob (gethash (list lhs rhs) (grammar-ptab *grammar*))) (+ (rule-log-prob (gethash (list lhs rhs) (grammar-ptab *grammar*))) (apply #'+ (mapcar #'tree-prob (rest tree))))))) (defun edge-to-tree (edge) (if (edge-children edge) (cons (edge-category edge) (mapcar #'edge-to-tree (edge-children edge))) (list (edge-category edge) (edge-surface edge)))) (defun parse (input) (cky-parse (loop for token in input for code = (symbol-to-code token *pcfg-symbol-table* :rop t) collect code)))