;;; -*- mode: common-lisp; coding: utf-8; package: tsdb -*- (in-package :tsdb) ;;; ;;; [incr tsdb()] --- Competence and Performance Profiling Environment ;;; Copyright (c) 2008 -- 2017 Stephan Oepen (oe@ifi.uio.no) ;;; ;;; This program is free software; you can redistribute it and/or modify it ;;; under the terms of the GNU Lesser General Public License as published by ;;; the Free Software Foundation; either version 2.1 of the License, or (at ;;; your option) any later version. ;;; ;;; This program is distributed in the hope that it will be useful, but WITHOUT ;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public ;;; License for more details. ;;; (defun starsem-summarize-tokens (tokens &key (i 0)) (labels ((anchor (string token) (let* ((form (get-field :form token)) (length (length form)) (start (search string form))) (if start (let* (#+:null (base (* (get-field :id token) 100)) (base (get-field :start token)) (end (+ start (length string))) (affix (cond ((and (zerop start) (= end length)) :form) ((zerop start) :prefix) ((= end length) :suffix) (t :infix)))) (list (+ base start) (min (+ base end) (get-field :end token)) affix)) (error "summarize-starsem(): unable to anchor `~a' in ~a." string token)))) (collect (index) (loop for token in tokens for string = (nth index (get-field :starsem token)) unless (or (null string) (string= string "_") (string= string "***")) collect (anchor string token)))) (loop with n = (loop for token in tokens maximize (length (get-field :starsem token))) with cues with scopes with events for base from 0 to (- n 1) by 3 for i from i for cue = (collect base) for scope = (collect (+ base 1)) for event = (collect (+ base 2)) when cue do (push (cons i cue) cues) when scope do (push (cons i scope) scopes) when event do (push (cons i event) events) finally (labels ((massage (entities) (loop for entity in (nreverse entities) for id = (acons :id (first entity) nil) for span = (combine-spans (rest entity)) collect (nconc id (acons :span span nil))))) (return (when (or cues scope events) (pairlis '(:ncues :nscopes :nevents) (list (massage cues) (massage scopes) (massage events))))))))) (defun starsem-output (tokens &optional cues scopes events &key id gscopes chapter sentence (stream t) (format :string)) (when (get-field :i-tokens tokens) (setf tokens (get-field :i-tokens tokens)) (setf cues (get-field :ncues tokens)) (setf scopes (get-field :nscopes tokens)) (setf events (get-field :nevents tokens))) (cond ((stringp stream) (with-open-file (stream stream :direction :output :if-exists :supersede) (starsem-output tokens cues scopes events :chapter chapter :sentence sentence :id id :gscopes gscopes :stream stream :format format))) ((null stream) (with-output-to-string (stream) (starsem-output tokens cues scopes events :chapter chapter :sentence sentence :id id :gscopes gscopes :stream stream :format format))) (t (cond ((smember format '(:string :xml)) (let* ((starts (make-hash-table :test #'eql)) (ends (make-hash-table :test #'eql))) (labels ((index (id type span) (let ((entry (cons id (cons type span)))) (push entry (gethash (first span) starts)) (push entry (gethash (second span) ends)))) (record (id type spans) (loop for span in spans do (index id type span))) (open-span (id type) (declare (ignore id)) (case format (:string (write-char (case type (:cue #\<) (:scope #\{) (:event #\|)) stream)))) (close-span (id type) (declare (ignore id)) (case format (:string (write-char (case type (:cue #\>) (:scope #\}) (:event #\|)) stream))))) ;; ;; index scope and cue elements by start and end character positions ;; (loop for cue in cues when (or (null id) (= id (get-field :id cue))) do (record (get-field :id cue) :cue (get-field :span cue))) (loop for scope in scopes when (or (null id) (= id (get-field :id scope))) do (record (get-field :id scope) :scope (get-field :span scope))) (loop for event in events when (or (null id) (= id (get-field :id event))) do (record (get-field :id event) :event (get-field :span event))) ;; ;; for each position, sort elements starting or ending here according ;; to `size', with larger scopes `nesting around' smaller ones and ;; scopes nesting around events nesting around cues. ;; (loop for start being each hash-key in starts do (setf (gethash start starts) (sort (gethash start starts) #'(lambda (spana spanb) (let ((enda (fourth spana)) (endb (fourth spanb)) (typea (second spana))) (or (> enda endb) (and (= enda endb) (eq typea :scope)) (and (= enda endb) (eq typea :event) (eq (second spanb) :cue)))))))) (loop for end being each hash-key in ends do (setf (gethash end ends) (sort (gethash end ends) #'(lambda (spana spanb) (let ((starta (third spana)) (startb (third spanb)) (typea (second spana))) (or (> starta startb) (and (= starta startb) (eq typea :cue)) (and (= starta startb) (eq typea :event) (eq (second spanb) :scope)))))))) ;; ;; if gold scopes are available, determine whether there is a match ;; (if gscopes (let ((gold (if id (loop for foo in gscopes thereis (when (= (get-field :id foo) id) foo)) gscopes)) (system (if id (loop for foo in scopes thereis (when (= (get-field :id foo) id) foo)) scopes))) (setf gscopes (equal gold system)) (format stream "~:[-~;+~]" gscopes)) (setf gscopes t)) ;; ;; finally, write out the annotated string, one token at a time and, ;; (in string-based mode) within each token, one character at a time ;; (loop for token in tokens for form = (get-field :form token) for id = (get-field :id token) for start = (or (get-field :start token) (* id 100)) do (write-char #\space stream) (loop for i from 0 to (- (length form) 1) do (loop for span in (gethash (+ start i) starts) do (open-span (first span) (second span))) (write-char (schar form i) stream) (loop for span in (gethash (+ start i 1) ends) do (close-span (first span) (second span)))))) gscopes)) ((smember format '(:starsem :starsem+ :epe)) (labels ((intersect (token spans type) (loop with start = (get-field :start token) with end = (get-field :end token) with span = (and start end (list start end)) for foo in spans for intersection = (when (and span foo) (first (span-intersection (get-field :span foo) span))) when intersection collect (let* ((offset start) (start (- (first intersection) offset)) (end (- (second intersection) offset))) (list (get-field :id foo) type start end))))) (loop finally (terpri stream) for token in tokens for json = (when (eq format :epe) (make-string-output-stream)) for cspans = (intersect token cues :cue) for sspans = (intersect token scopes :scope) for espans = (intersect token events :event) do #+:null (pprint (list (get-field :form token) cspans sspans espans)) (case format ((:starsem :starsem+) (format stream "~{~a~^ ~}" (loop for key in (append '(:chapter :sentence :id) (if (eq format :starsem+) '(:start :end :form :lemma :pos :features :head :deprel) '(:form :lemma :pos :ptb))) collect (or (case key (:chapter (or chapter (get-field key token))) (:sentence (or sentence (get-field key token))) (:pos (or (get-field key token) (get-field :xpos token))) (:features (or (get-field key token) (get-field :feat token))) (t (get-field key token))) "_"))))) (if (and (null cues) (null scopes) (null events)) (case format ((:starsem :starsem+) (format stream " ***"))) (labels ((output (token start end) (let ((form (get-field :form token)) (special '("``" "''" "--" "---" "..."))) (if (and (= (- end start) 1) (member form special :test #'string=)) form (subseq form start end))))) (loop with initial = t for cue in cues for id = (get-field :id cue) for cspan = (find id cspans :key #'first) for sspan = (find id sspans :key #'first) for espan = (find id espans :key #'first) do (case format ((:starsem :starsem+) (format stream " ~a ~a ~a" (if cspan (output token (third cspan) (fourth cspan)) "_") (if sspan (output token (third sspan) (fourth sspan)) "_") (if espan (output token (third espan) (fourth espan)) "_"))) (:epe (when (or cspan sspan espan) (unless initial (format json ", ")) (setf initial nil) (format json "{\"id\": ~a" id) (when cspan (format json ", \"cue\": \"~a\"" (json-escape-string (output token (third cspan) (fourth cspan))))) (when sspan (format json ", \"scope\": \"~a\"" (json-escape-string (output token (third sspan) (fourth sspan))))) (when espan (format json ", \"event\": \"~a\"" (json-escape-string (output token (third espan) (fourth espan))))) (format json "}"))))))) (case format (:epe (let ((json (get-output-stream-string json))) (unless (string= json "") (set-field :negation (format nil "[~a]" json) token)))) (t (format stream "~%")))))) (t (error "starsem-output(): unknown format `~(~a~)'." format)))))) (defun span-intersection (one two &optional (base 1)) (if (consp (first one)) (loop for span in one append (span-intersection span two)) (if (consp (first two)) (loop for span in two append (span-intersection one span)) (let* ((ostart (truncate (first one) base)) (oend (truncate (second one) base)) (tstart (truncate (first two) base)) (tend (truncate (second two) base)) (from (max ostart tstart)) (to (min oend tend))) (and (< from to) (list (list from to))))))) (defun span-inclusion (one two) (if (consp (first one)) (loop for span in one always (span-inclusion span two)) (if (consp (first two)) (loop for span in two thereis (span-inclusion one span)) (let* ((base 1) (ostart (truncate (first one) base)) (oend (truncate (second one) base)) (tstart (truncate (first two) base)) (tend (truncate (second two) base))) (and (>= ostart tstart) (<= oend tend)))))) (defun span-equal (one two) (if (consp (first one)) (when (consp (first two)) (loop for one in one for two in two always (span-equal one two))) (unless (consp (first two)) (let* ((base 1) (ostart (truncate (first one) base)) (oend (truncate (second one) base)) (tstart (truncate (first two) base)) (tend (truncate (second two) base))) (and (= ostart tstart) (= oend tend)))))) (defun combine-spans (spans) ;; ;; _fix_me_ ;; i believe the main benefit of this function is to minimize the number of ;; contiguous spans, which presumably will directly correspond to the number ;; of opening and closing braces in the pretty-printed textual format (unless ;; it were the case that starsem-output() were doing something smart itself; ;; not sure just now). the code below depends on the scheme used originally ;; to put :start and :end character positions on tokens, even while only ;; working with the pre-tokenized *SEM 2012 files. the method to detect ;; token adjacency used below, thus, does not work in the new (EPE 2017) ;; universe. it should not be too hard to generalize accordingly, but ;; presumably we will need access to actual tokens (where adjacency is ;; obvious, if looking at the full sequence). for just now, we can make do ;; without the extra normalization step, though. (16-mar-17; oe) ;; (return-from combine-spans spans) (labels ((reduce (spans) (if (or (null spans) (null (rest spans))) spans (let ((combination (combine (first spans) (second spans)))) (if combination (reduce (cons combination (rest (rest spans)))) (cons (first spans) (reduce (rest spans))))))) (combine (left right) (when (and (numberp (first left)) (numberp (first right))) (if (equal left right) right (when (and (= (+ (truncate (second left) 100) 1) (truncate (first right) 100)) (or (null (third left)) (smember (third left) '(:suffix :form))) (or (null (third right)) (smember (third right) '(:prefix :form)))) (let ((type (cond ((and (or (null (third left)) (eq (third left) :form)) (or (null (third right)) (eq (third right) :form))) :form) ((or (null (third right)) (eq (third right) :form)) :suffix) ((or (null (third left)) (eq (third left) :form)) :prefix) (t :infix)))) (list (first left) (second right) type))))))) (reduce (sort (copy-list spans) #'(lambda (left right) (or (< (first left) (first right)) (and (= (first left) (first right)) (< (second left) (second right))))))))) (defun index-negations (items &optional (table (make-hash-table))) (loop with all for item in items for cues = (get-field :ncues item) for scopes = (get-field :nscopes item) for events = (get-field :nevents item) do (loop for cue in cues for start = (loop for span in (get-field :span cue) minimize (first span)) for end = (loop for span in (get-field :span cue) maximize (second span)) for id = (get-field :id cue) for scope = (find id scopes :key #'(lambda (foo) (get-field :id foo))) for event = (find id events :key #'(lambda (foo) (get-field :id foo))) do (let ((cue (append cue (pairlis '(:start :end) (list start end))))) (setf (gethash id table) (list cue scope event)) (push cue all))) finally (return (sort all #'(lambda (foo bar) (let ((start1 (get-field :start foo)) (start2 (get-field :start bar))) (or (< start1 start2) (and (= start1 start2) (< (get-field :end foo) (get-field :end bar)))))))))) (defun lookup-negation (item cues table) (loop with tokens = (get-field :i-tokens item) with start = (loop for token in tokens for start = (get-field :start token) when start minimize start) with end = (loop for token in tokens for end = (get-field :end token) when end maximize end) for cue in cues while (<= (get-field :end cue) end) when (<= start (get-field :start cue)) collect cue into result finally (let (scopes events) (loop for cue in result for bucket = (gethash (get-field :id cue) table) do (push (second bucket) scopes) (push (third bucket) events)) (return (when (or result scopes events) (pairlis '(:ncues :nscopes :nevents) (list result scopes events))))))) #+:null (let* ((root "~/src/starsem/") (baskervilles (format nil "~aprimary/~a" root "SEM-2012-SharedTask-CD-SCO-training-09032012.txt")) (wisteria (format nil "~aprimary/~a" root "SEM-2012-SharedTask-CD-SCO-dev-09032012.txt")) (*conll-type* :starsem)) (do-import-items wisteria "tmp/ssd" :format :conll) (purge-profile-cache :all) (with-open-file (stream (format nil "~a/ssd.txt" root) :direction :output :if-exists :supersede) (loop for item in (analyze "tmp/ssd" :tokensp t :commentp t) when (get-field :ncues item) do (format stream "[~a] " (get-field :i-id item)) (starsem-output (get-field :i-tokens item) (get-field :ncues item) (get-field :nscopes item) (get-field :nevents item) :stream stream :format :string) (terpri stream))) (do-import-items baskervilles "tmp/sst" :format :conll) (purge-profile-cache :all) (with-open-file (stream (format nil "~a/sst.txt" root) :direction :output :if-exists :supersede) (loop for item in (analyze "tmp/sst" :tokensp t :commentp t) when (get-field :ncues item) do (format stream "[~a] " (get-field :i-id item)) (starsem-output (get-field :i-tokens item) (get-field :ncues item) (get-field :nscopes item) (get-field :nevents item) :stream stream :format :string) (terpri stream)))) #+:null (let* ((root "~/src/starsem/") (circle (format nil "~aprimary/~a" root "SEM-2012-SharedTask-CD-SCO-test-circle-GOLD.txt")) (cardboard (format nil "~aprimary/~a" root "SEM-2012-SharedTask-CD-SCO-test-cardboard-GOLD.txt")) (*conll-type* :starsem)) (do-import-items circle "tmp/ssi" :format :conll) (purge-profile-cache :all) (with-open-file (stream (format nil "~a/ssi.txt" root) :direction :output :if-exists :supersede) (loop for item in (analyze "tmp/ssi" :tokensp t :commentp t) when (get-field :ncues item) do (format stream "[~a] " (get-field :i-id item)) (starsem-output (get-field :i-tokens item) (get-field :ncues item) (get-field :nscopes item) (get-field :nevents item) :stream stream :format :string) (terpri stream))) (do-import-items cardboard "tmp/ssa" :format :conll) (purge-profile-cache :all) (with-open-file (stream (format nil "~a/ssa.txt" root) :direction :output :if-exists :supersede) (loop for item in (analyze "tmp/ssa" :tokensp t :commentp t) when (get-field :ncues item) do (format stream "[~a] " (get-field :i-id item)) (starsem-output (get-field :i-tokens item) (get-field :ncues item) (get-field :nscopes item) (get-field :nevents item) :stream stream :format :string) (terpri stream)))) #+:null (loop with *conll-type* = :starsem+ for section in '("training" "development" #+:null "evaluation") for gold = (format nil "~~/lib/epe/2017/negation/~a/gold.*sem+" section) for profile = (format nil "tmp/~a" section) for cin = (format nil "~~/lib/epe/2017/negation/~a/corenlp.conllu+" section) for uin = (format nil "~~/lib/epe/2017/negation/~a/udpipe.conllu+" section) for cout = (format nil "~~/lib/epe/2017/negation/~a/corenlp.epe" section) for uout = (format nil "~~/lib/epe/2017/negation/~a/udpipe.epe" section) do (purge-profile-cache :all) (format t "importing `~a' ...~%" gold) (do-import-items gold profile :format :conll :create :purge) (purge-profile-cache :all) (let ((items (analyze profile :tokensp t :commentp t))) (when (probe-file cin) (format t "reading `~a' ...~%" cin) (with-open-file (stream cout :direction :output :if-exists :supersede) (loop with table = (make-hash-table) with cues = (index-negations items table) for item in (read-items-from-conll-file cin :type :conllu+ :rawp t) for sentence from 0 for gold = (lookup-negation item cues table) do (starsem-output (get-field :i-tokens item) (get-field :ncues gold) (get-field :nscopes gold) (get-field :nevents gold) :sentence sentence :stream nil :format :epe) (conll-output item :stream stream :type :epe) (terpri stream)))) (when (probe-file uin) (format t "reading `~a' ...~%" uin) (with-open-file (stream uout :direction :output :if-exists :supersede) (loop with table = (make-hash-table) with cues = (index-negations items table) for item in (read-items-from-conll-file uin :type :conllu+ :rawp t) for sentence from 0 for gold = (lookup-negation item cues table) do (starsem-output (get-field :i-tokens item) (get-field :ncues gold) (get-field :nscopes gold) (get-field :nevents gold) :sentence sentence :stream nil :format :epe) (conll-output item :stream stream :type :epe) (terpri stream))))))