;;; -*- mode: common-lisp; coding: utf-8; package: tsdb -*-
(in-package :tsdb)
;;;
;;; [incr tsdb()] --- Competence and Performance Profiling Environment
;;; Copyright (c) 2017 -- 2017 Stephan Oepen (oe@ifi.uio.no)
;;;
;;; This program is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU Lesser General Public License as published by
;;; the Free Software Foundation; either version 2.1 of the License, or (at
;;; your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful, but WITHOUT
;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
;;; License for more details.
;;;
(defun epe-preprocess (raw &optional tt &key debug segments alignp ids)
  "Preprocess the text file RAW for EPE: sentence-split and tokenize (via
the LOGON `tokenizer' and REPP binaries), character-align tokens back to
RAW (RESAalign, when ALIGNP), and part-of-speech tag (TnT, then GENIA).
SEGMENTS, when non-nil, is a list of pre-tokenized sentences that bypasses
the external splitter and REPP.  When TT names a file, write a dump of the
tagged tokens there, one sentence per block, prefixed with the matching
entry of IDS when that is a number.  Unless DEBUG, intermediate files are
deleted.  Returns the list of sentences, each a list of token alists with
:start, :end, and :form fields."
  (let* ((*genia-tokenize-p* nil)
         (user (current-user))
         (pid (current-pid))
         ;; shell command for the LOGON sentence splitter
         (tokenizer
          (format
           nil "exec ~a -L en-u8 -S -N -P -x -E '' '~a'"
           (logon-file "bin" "tokenizer" :string) raw)
          ;; read-time disabled alternate invocation, kept for reference
          #+:null
          (format
           nil "exec ~a -L en-u8 -S -N -x -E '~a' '~a'"
           (logon-file "bin" "tokenizer" :string) #\newline raw))
         ;; unique scratch files (by user, pid, and a fresh gensym) for the
         ;; splitter output, the REPP output, and the RESA alignment table
         (sentences
          (format
           nil "~a/.epe.tokenizer.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (repp
          (format
           nil "exec ~a -c '~a' --format line"
           (logon-file "bin" "repp" :string)
           (logon-file "lingo/erg/pet" "repp.set" :string)))
         (tokens
          (format
           nil "~a/.epe.repp.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (alignment
          (format
           nil "~a/.epe.resa.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (resa
          (format
           nil "exec ~a -f TAB -r '~a' -i '~a' -o '~a'"
           (logon-file "bin" "resaalign" :string)
           (namestring raw) tokens alignment)))
    (unless (probe-file raw)
      (error "epe-preprocess(): invalid input file ‘~a’." raw))
    (unless segments
      (run-process
       tokenizer :wait t :output sentences :error-output nil)
      (unless (probe-file sentences)
        (error "epe-preprocess(): mysterious error in sentence splitting."))
      ;;
      ;; _fix_me_
      ;; REPP appears to ignore the final sentence unless there is a trailing
      ;; newline; tell bec, one day.  (17-apr-17; oe)
      ;;
      (with-open-file (stream sentences :direction :output :if-exists :append)
        (terpri stream)))
    ;;
    ;; furthermore, tokenizer actually appears to apply some normalization: it
    ;; converts (initial, at least) HT (ASCII 9) to SP (32) and keeps it as part
    ;; of the sentence: REPP will return correct offsets, but when we then call
    ;; RESAalign, it considers the sentence start without whitespace, hence the
    ;; summing of sentence plus token offsets goes wrong. (18-apr-17; oe)
    ;;
    ;; but it turned out impossible in the end to make tokenizer not normalize
    ;; sentence-internal whitespace, i.e. sequences of HT, SP, or NL, which it
    ;; will just read much like a single space. (4-jun-17; oe)
    ;;
    #+:null
    (let ((lines
           (with-open-file (stream sentences)
             (loop
                 for line = (read-line stream nil nil)
                 while line collect line))))
      (with-open-file (stream sentences
                       :direction :output :if-exists :supersede)
        (labels ((whitespacep (char)
                   (member
                    char
                    '(#\space #\tab #\newline #\page #\return #\linefeed)
                    :test #'char=)))
          (loop
              with initial = t
              for line in lines
              for string
              = (if initial (string-left-trim '(#\space #\tab) line) line)
              ;; NOTE(review): searching for the empty string always yields
              ;; 0, making every line look sentence-final; an end-of-sentence
              ;; marker string appears to have been lost here — confirm
              ;; before ever reviving this disabled code.
              for eos = (search "" string)
              unless (and initial (every #'whitespacep string)) do
                (setf initial nil)
                (cond
                 (eos
                  (write-string (subseq string 0 eos) stream)
                  (terpri stream)
                  (setf initial t))
                 (t
                  (if (zerop (length string))
                      (write-char #\space stream)
                      (write-string string stream))))))))
    ;;
    ;; produce the token file: either dump the pre-tokenized SEGMENTS, one
    ;; token per line with a blank line after each sentence, or run REPP
    ;; over the sentence-split file.
    ;;
    (if segments
        (with-open-file (stream tokens :direction :output
                         ;; fixed: was misspelled ‘:if-exixts’, which makes
                         ;; with-open-file signal on an existing file
                         :if-exists :supersede)
          (loop
              for sentence in segments
              do
                (format stream "~{~a~%~}" sentence)
                (terpri stream)))
        (run-process
         repp :wait t :input sentences :output tokens :error-output nil))
    (unless (probe-file tokens)
      (error "epe-preprocess(): mysterious error in tokenization."))
    ;;
    ;; character alignment: either run RESAalign, or just create an empty
    ;; alignment file so the reader below finds one.
    ;;
    (if alignp
        (run-process resa :wait t :output nil :error-output nil)
        (with-open-file (stream alignment :direction :output
                         ;; fixed: was misspelled ‘:if-exixts’
                         :if-exists :supersede)))
    (unless (probe-file alignment)
      (error "epe-preprocess(): mysterious error in RESA alignment."))
    (genia-shutdown)
    (unless debug (delete-file sentences))
    (let* ((pairs
            ;; (start . end) character offsets, one per ‘TOK’ line of the
            ;; tab-separated alignment file, in token order
            (with-open-file (stream alignment)
              (loop
                  with scanner = (ppcre:create-scanner "\\t")
                  for line = (read-line stream nil nil)
                  for fields = (and line (ppcre:split scanner line))
                  while line
                  when (and fields (string= (third fields) "TOK"))
                  collect (cons
                           (parse-integer (first fields))
                           (parse-integer (second fields))))))
           (sentences
            ;; read back the token file: blank lines delimit sentences; pair
            ;; each token with the next alignment entry, defaulting to -1
            ;; offsets when the alignment is exhausted (or was not run).
            (with-open-file (stream tokens)
              (loop
                  with sentence
                  for line = (read-line stream nil nil)
                  when (and (or (null line) (string= line "")) sentence)
                  collect (nreverse sentence)
                  and do (setf sentence nil)
                  else when (and line (not (string= line ""))) do
                    (let ((pair (pop pairs)))
                      (push
                       (pairlis
                        '(:start :end :form)
                        (list (or (first pair) -1) (or (rest pair) -1) line))
                       sentence))
                  while line)))
           (tnt (loop for sentence in sentences collect (lkb::tnt sentence)))
           (genia (loop for sentence in tnt collect (genia sentence))))
      (unless debug
        (delete-file tokens)
        (delete-file alignment))
      (when (or (stringp tt) (pathnamep tt))
        (with-open-file (stream tt :direction :output :if-exists :supersede)
          (loop
              for id in ids
              for sentence in genia
              do
                (when (numberp id) (format stream "#~a~%" id))
                (loop
                    for token in sentence
                    when (numberp id)
                    do
                      ;; identified sentences: two-column ‘stem pos’ lines
                      (format
                       stream "~a ~a~%"
                       (get-field :stem token)
                       (get-field :pos token))
                    else do
                      ;; otherwise five columns: offsets, form, stem, pos
                      (format
                       stream "~a ~a ~a ~a ~a~%"
                       (get-field :start token)
                       (get-field :end token)
                       (get-field :form token)
                       (get-field :stem token)
                       (get-field :pos token)))
                (terpri stream))))
      (when (and alignp debug)
        ;;
        ;; sanity check: the substring of the raw text at each token's
        ;; offsets must match the recorded token form.
        ;; NOTE(review): the dump above is space-separated, yet this splits
        ;; on TAB — confirm the intended TT file format before relying on
        ;; this check.
        ;;
        (with-open-file (stream tt)
          (loop
              with text
              = (with-open-file (stream raw)
                  (loop
                      for c = (read-char stream nil nil)
                      while c collect c into result
                      finally (return (coerce result 'string))))
              with scanner = (ppcre:create-scanner "\\t")
              for i from 0
              for line = (read-line stream nil nil)
              for fields = (and line (ppcre:split scanner line))
              while line
              when fields
              do
                (let* ((start (parse-integer (first fields)))
                       (end (parse-integer (second fields)))
                       (token (subseq text start end)))
                  (unless (string= token (third fields))
                    (format
                     t "[~a] <~a:~a> ‘~a’ vs. ‘~a’.~%"
                     i start end token (third fields)))))))
      ;; NOTE(review): this returns the untagged token lists, not the TnT-
      ;; or GENIA-tagged variants computed above — confirm that callers
      ;; expect the raw tokens here.
      sentences)))
(defun epe-project (input gold output)
  "Project gold-standard negation annotation onto EPE graphs: import GOLD
(a *SEM-style CoNLL file) into a scratch profile, index its negations, and
for each item read from INPUT emit the combined CoNLL rendering to OUTPUT."
  (unless (probe-file input)
    (error "epe-project(): invalid input file ‘~a’." input))
  (unless (probe-file gold)
    (error "epe-project(): invalid negation file ‘~a’." gold))
  (let* ((*conll-type* :starsem+)
         (profile (format nil "tmp/epe.~a" (gensym ""))))
    ;; import the gold negation data into a fresh scratch profile, flushing
    ;; the profile cache on either side of the import
    (purge-profile-cache :all)
    (do-import-items gold profile :format :conll :create :purge)
    (purge-profile-cache :all)
    (with-open-file (stream output :direction :output :if-exists :supersede)
      (let* ((items (analyze profile :tokensp t :commentp t))
             (table (make-hash-table))
             (cues (index-negations items table)))
        (dolist (item (read-epe input))
          (let ((gold (lookup-negation item cues table)))
            (starsem-output
             (get-field :i-tokens item)
             (get-field :ncues gold)
             (get-field :nscopes gold)
             (get-field :nevents gold)
             :stream nil :format :epe)
            (conll-output item :stream stream :type :epe)
            (terpri stream)))))))
(defun read-epe (stream &key debug)
  "Read EPE dependency graphs from STREAM — an open stream, a file
designator, or the string ‘-’ for standard input — one JSON object per
graph, decoded with cl-json.  Returns a list of items, each an alist of
:i-id (the graph ‘id’) and :i-tokens (the node list, with :edges resolved
to (label . node) pairs referencing target nodes directly).  Returns nil
for a non-existent file.  DEBUG is only used by currently disabled
validation code."
  (cond
   ((and (stringp stream) (string= stream "-"))
    ;; fixed: the original clause merely rebound STREAM and returned the
    ;; stream object — cond does not fall through — so ‘-’ never actually
    ;; read anything; recurse on standard input instead.
    (read-epe *standard-input* :debug debug))
   ((or (pathnamep stream) (stringp stream))
    ;; file designator: open and recurse; silently nil when missing
    (when (probe-file stream)
      (with-open-file (stream stream :direction :input)
        (read-epe stream :debug debug))))
   ((streamp stream)
    (labels ((clone (object keys)
               ;; copy selected fields from OBJECT, trying each key
               ;; verbatim, then as an upper-case, then as a lower-case
               ;; keyword, to tolerate either identifier convention
               (loop
                   for key in keys
                   for value
                   = (or (get-field key object)
                         (let ((key (intern (string-upcase key) :keyword)))
                           (get-field key object))
                         (let ((key (intern (string-downcase key) :keyword)))
                           (get-field key object)))
                   when value append (acons key value nil))))
      ;; presumably binding the identifier mapping to #'string preserves
      ;; JSON key case, hence the :|id|-style keys below — confirm against
      ;; the cl-json version in use.
      (let ((json:*json-identifier-name-to-lisp* #'string)
            items)
        (handler-case
            (loop
                for object = (json:decode-json stream)
                for id = (get-field :|id| object)
                for nodes = (get-field :|nodes| object)
                for graph
                = (loop
                      for node in nodes
                      for token
                      = (clone node '(:id :start :end :top :form :properties))
                      do
                        ;;
                        ;; _fix_me_
                        ;; the initial set of UDPipe outputs has ‘feats’ in UD
                        ;; format as one property; patch this up here.
                        ;; (3-may-17; oe)
                        (let* ((properties (get-field :properties token))
                               (feats (get-field :|feats| properties)))
                          (when feats
                            (setf properties
                              (delete :|feats| properties :key #'first))
                            (nconc properties (conll-parse-features feats))))
                      collect token)
                do
                  ;; resolve edges: attach (label . target-node) pairs to
                  ;; each cloned node; edges whose target identifier matches
                  ;; no node are silently dropped
                  (loop
                      for old in nodes
                      for new in graph
                      for edges
                      = (loop
                            for edge in (get-field :|edges| old)
                            for label = (get-field :|label| edge)
                            for target = (get-field :|target| edge)
                            for match
                            = (find
                               target graph
                               :key #'(lambda (node) (get-field :id node)))
                            when match collect (cons label match))
                      when edges do (set-field :edges edges new))
                  (push
                   (pairlis '(:i-id :i-tokens) (list id graph))
                   items))
          (end-of-file ()
            ;; decode-json signals end-of-file once the stream is drained;
            ;; this is the regular exit from the read loop
            (setf items (nreverse items))
            ;; read-time disabled graph validation (missing, duplicate, or
            ;; dangling node identifiers); DEBUG would name its output file
            #+:null
            (when debug
              (with-open-file (stream debug :direction :output
                               :if-exists :supersede)
                (loop
                    for item in items
                    for iid = (get-field :i-id item)
                    do
                      (loop
                          with graph = (get-field :i-tokens item)
                          with ids
                          = (loop
                                for node in graph
                                for id
                                = (let ((id (get-field :id node)))
                                    (if (stringp id)
                                        id
                                        #+:null
                                        (ignore-errors (parse-integer id))
                                        id))
                                when (null id) do
                                  (format
                                   t "[~a] invalid node identifier: ‘~a’.~%"
                                   iid (get-field :id node))
                                else when (member id ids :test #'equal) do
                                  (format
                                   t "[~a] duplicate node identifier: ‘~a’.~%"
                                   iid id)
                                else collect id into ids)
                          for node in graph
                          do
                            (loop
                                for edge in (get-field :edges node)
                                for target
                                = (let ((target (get-field :target edge)))
                                    (if (stringp target)
                                        (ignore-errors (parse-integer target))
                                        target))
                                when (null target) do
                                  (format
                                   t
                                   "[~a] invalid edge identifier: ‘~a’ (‘~a’).~%"
                                   iid (get-field :target edge)
                                   (get-field :label edge))
                                else unless (member target ids :test #'equal) do
                                  ;; fixed message typo: was ‘unkown’
                                  (format
                                   t
                                   "[~a] unknown edge identifier: ‘~a’ (‘~a’).~%"
                                   iid target (get-field :label edge)))))))
            items)))))))
(defun conll-barcelona-expand (graph tokens &key (from :dsynt))
  "Expand a Barcelona-style dependency GRAPH in place: rebuild :top and
:edges links from the per-node ‘governors’ and ‘dependencies’ fields,
splice surface :form, :start, and :end from the node with matching ‘id0’
in TOKENS, parse and prune each node's feature string, and finally sort
the nodes by character span.  FROM selects the source layer (:dsynt or
:bpa) and determines which features are filtered out or renamed.  Returns
GRAPH."
  ;; clear any stale structural annotation
  (loop
      for node in (get-field :i-tokens graph)
      do
        (set-field :top nil node)
        (set-field :edges nil node))
  (loop
      with graph = (get-field :i-tokens graph)
      with tokens = (get-field :i-tokens tokens)
      with filter
      ;; fixed: the original tested (eq from :dsyn), which can never match
      ;; the :dsynt default, so the first filter list was unreachable in
      ;; default calls; accept both spellings for backward compatibility.
      = (if (member from '(:dsyn :dsynt))
            '(:|id1| :|id2| :|id3| :|id4| :|id5| :|anteposed| :|arguments|
              :|circum_paradigm| :|conj_num| :|coord_conj| :|elided|
              :|hasDuplicate| :|head_of_relative| :|hasAntecedent| :|hasRelPro|
              :|I| :|II| :|III| :|IV| :|root|)
            '(:|benefactive| :|connect_check| :|direction| :|extent|
              :|manner| :|place| :|pred_Name| :|pred_Value| :|purpose|
              :|root| :|time| :|vn|))
      with rename
      = (when (eq from :bpa) '((:|word| . :|lemma|) (:|dpos| . :|pos|)))
      for node in graph
      for governors
      = (loop
            for string in (ppcre:split "," (get-field :governors node))
            collect (ignore-errors (parse-integer string)))
      for dependencies = (ppcre:split "," (get-field :dependencies node))
      for features
      = (conll-parse-features
         (get-field :features node)
         :filter filter :rename rename)
      for id0 = (ignore-errors (parse-integer (get-field :|id0| features)))
      for token
      = (find id0 tokens :key #'(lambda (node) (get-field :id node)))
      for form = (get-field :form token)
      for start = (get-field :start token)
      for end = (get-field :end token)
      do
        ;; splice surface token information into the graph node
        (when token
          (nconc node (pairlis '(:form :start :end) (list form start end))))
        (when (eq from :bpa)
          ;; keep only the part of the ‘vn’ value after the first colon
          ;; NOTE(review): this reads :vn off NODE, not off FEATURES —
          ;; confirm that is where the value lives in :bpa inputs.
          (let* ((vn (get-field :vn node))
                 (colon (position #\: vn))
                 (vn (and colon (subseq vn (+ colon 1)))))
            (when vn (nconc features (acons :|vn| vn nil)))))
        (set-field :features (remove :|id0| features :key #'first) node)
        ;; link each node under its governor(s); nodes with an unresolvable
        ;; governor are flagged as (candidate) roots instead
        (loop
            for governor in governors
            for dependency in dependencies
            for target
            = (find governor graph :key #'(lambda (node) (get-field :id node)))
            when target
            do (push (cons dependency node) (get-field :edges target))
            else do (set-field :top t node)))
  ;; order nodes by start position, then end; nodes lacking offsets are
  ;; never considered smaller and thus sink toward the end
  (set-field
   :i-tokens
   (sort
    (get-field :i-tokens graph)
    #'(lambda (one two)
        (let ((ostart (get-field :start one))
              (oend (get-field :end one))
              (tstart (get-field :start two))
              (tend (get-field :end two)))
          (when (and ostart oend)
            (or
             (null tstart) (null tend)
             (< ostart tstart)
             (and (= ostart tstart) (< oend tend)))))))
   graph)
  graph)