;;; -*- mode: common-lisp; coding: utf-8; package: tsdb -*-

(in-package :tsdb)

;;;
;;; [incr tsdb()] --- Competence and Performance Profiling Environment
;;; Copyright (c) 2017 -- 2017 Stephan Oepen (oe@ifi.uio.no)
;;;
;;; This program is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU Lesser General Public License as published by
;;; the Free Software Foundation; either version 2.1 of the License, or (at
;;; your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful, but WITHOUT
;;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;;; License for more details.
;;;

(defun epe-preprocess (raw &optional tt &key debug segments alignp ids)
  ;;
  ;; run the raw input file .raw. through the LOGON sentence splitter
  ;; (`tokenizer'), REPP tokenization, and RESA character-offset alignment;
  ;; tag the resulting tokens (TnT, then GENIA) and return a list of tagged
  ;; sentences, where each token is an alist with (at least) :start, :end,
  ;; and :form fields.  when .tt. is a file designator, additionally write
  ;; the tagged tokens in a tab-like plain-text format.  .segments., when
  ;; supplied, bypasses sentence splitting and REPP and is taken to be a
  ;; pre-tokenized list of sentences (each a list of token strings).
  ;; .alignp. enables the external RESA alignment; .ids., when numeric,
  ;; selects a reduced two-column output format per sentence.
  ;; intermediate files are left in place when .debug. is on.
  ;;
  (let* ((*genia-tokenize-p* nil)
         (user (current-user))
         (pid (current-pid))
         (tokenizer
          (format
           nil "exec ~a -L en-u8 -S -N -P -x -E '' '~a'"
           (logon-file "bin" "tokenizer" :string) raw)
          #+:null
          (format
           nil "exec ~a -L en-u8 -S -N -x -E '~a' '~a'"
           (logon-file "bin" "tokenizer" :string) #\newline raw))
         ;; scratch file names carry user, pid, and a gensym to avoid clashes
         (sentences
          (format
           nil "~a/.epe.tokenizer.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (repp
          (format
           nil "exec ~a -c '~a' --format line"
           (logon-file "bin" "repp" :string)
           (logon-file "lingo/erg/pet" "repp.set" :string)))
         (tokens
          (format
           nil "~a/.epe.repp.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (alignment
          (format
           nil "~a/.epe.resa.~a.~a.~a"
           (tmp :tsdb) user pid (string-downcase (gensym ""))))
         (resa
          (format
           nil "exec ~a -f TAB -r '~a' -i '~a' -o '~a'"
           (logon-file "bin" "resaalign" :string)
           (namestring raw) tokens alignment)))
    (unless (probe-file raw)
      (error "epe-preprocess(): invalid input file ‘~a’." raw))
    (unless segments
      (run-process tokenizer :wait t :output sentences :error-output nil)
      (unless (probe-file sentences)
        (error "epe-preprocess(): mysterious error in sentence splitting."))
      ;;
      ;; _fix_me_
      ;; REPP appears to ignore the final sentence unless there is a trailing
      ;; newline; tell bec, one day.                            (17-apr-17; oe)
      ;;
      (with-open-file (stream sentences :direction :output :if-exists :append)
        (terpri stream)))
    ;;
    ;; furthermore, tokenizer actually appears to apply some normalization: it
    ;; converts (initial, at least) HT (ASCII 9) to SP (32) and keeps it as
    ;; part of the sentence: REPP will return correct offsets, but when we then
    ;; call RESAalign, it considers the sentence start without whitespace,
    ;; hence the summing of sentence plus token offsets goes wrong.
    ;;                                                          (18-apr-17; oe)
    ;; but it turned out impossible in the end to make tokenizer not normalize
    ;; sentence-internal whitespace, i.e. sequences of HT, SP, or NL, which it
    ;; will just read much like a single space.                  (4-jun-17; oe)
    ;;
    ;; NOTE(review): the following block is read-time disabled (#+:null); the
    ;; (search "" string) below always yields 0 and looks like a sentinel
    ;; character was lost in transmission — confirm against upstream sources
    ;; before ever re-enabling.
    ;;
    #+:null
    (let ((lines (with-open-file (stream sentences)
                   (loop
                       for line = (read-line stream nil nil)
                       while line collect line))))
      (with-open-file (stream sentences
                       :direction :output :if-exists :supersede)
        (labels ((whitespacep (char)
                   (member
                    char '(#\space #\tab #\newline #\page #\return #\linefeed)
                    :test #'char=)))
          (loop
              with initial = t
              for line in lines
              for string = (if initial
                               (string-left-trim '(#\space #\tab) line)
                             line)
              for eos = (search "" string)
              unless (and initial (every #'whitespacep string)) do
                (setf initial nil)
                (cond
                 (eos
                  (write-string (subseq string 0 eos) stream)
                  (terpri stream)
                  (setf initial t))
                 (t
                  (if (zerop (length string))
                      (write-char #\space stream)
                    (write-string string stream))))))))
    (if segments
        ;; pre-segmented input: write one token per line, blank line between
        ;; sentences, i.e. the same layout REPP would have produced.
        (with-open-file (stream tokens
                         ;; was ‘:if-exixts’ — invalid keyword; fixed.
                         :direction :output :if-exists :supersede)
          (loop
              for sentence in segments
              do
                (format stream "~{~a~%~}" sentence)
                (terpri stream)))
      (run-process
       repp :wait t :input sentences :output tokens :error-output nil))
    (unless (probe-file tokens)
      (error "epe-preprocess(): mysterious error in tokenization."))
    (if alignp
        (run-process resa :wait t :output nil :error-output nil)
      ;; no alignment requested: just create an empty alignment file, so the
      ;; probe-file sanity check below and the reader loop still work.
      ;; was ‘:if-exixts’ — invalid keyword; fixed.
      (with-open-file (stream alignment
                       :direction :output :if-exists :supersede)))
    (unless (probe-file alignment)
      (error "epe-preprocess(): mysterious error in RESA alignment."))
    (genia-shutdown)
    (unless debug (delete-file sentences))
    (let* (;; character offset pairs for ‘TOK’ records from RESA output
           (pairs
            (with-open-file (stream alignment)
              (loop
                  with scanner = (ppcre:create-scanner "\\t")
                  for line = (read-line stream nil nil)
                  for fields = (and line (ppcre:split scanner line))
                  while line
                  when (and fields (string= (third fields) "TOK"))
                  collect (cons
                           (parse-integer (first fields))
                           (parse-integer (second fields))))))
           ;; re-read the token file, one token per line, blank line as
           ;; sentence separator; pair each token with its offsets (or -1
           ;; when the alignment ran dry).
           (sentences
            (with-open-file (stream tokens)
              (loop
                  with sentence
                  for line = (read-line stream nil nil)
                  when (and (or (null line) (string= line "")) sentence)
                  collect (nreverse sentence) and do (setf sentence nil)
                  else when (and line (not (string= line "")))
                  do
                    (let ((pair (pop pairs)))
                      (push
                       (pairlis
                        '(:start :end :form)
                        (list (or (first pair) -1) (or (rest pair) -1) line))
                       sentence))
                  while line)))
           (tnt (loop for sentence in sentences collect (lkb::tnt sentence)))
           (genia (loop for sentence in tnt collect (genia sentence))))
      (unless debug (delete-file tokens) (delete-file alignment))
      (when (or (stringp tt) (pathnamep tt))
        (with-open-file (stream tt :direction :output :if-exists :supersede)
          (loop
              for id in ids
              for sentence in genia
              do
                (when (numberp id) (format stream "#~a~%" id))
                (loop
                    for token in sentence
                    when (numberp id)
                    ;; reduced format: stem and part of speech only
                    do
                      (format
                       stream "~a ~a~%"
                       (get-field :stem token) (get-field :pos token))
                    else do
                      (format
                       stream "~a ~a ~a ~a ~a~%"
                       (get-field :start token) (get-field :end token)
                       (get-field :form token)
                       (get-field :stem token) (get-field :pos token)))
                (terpri stream))))
      ;; in debug mode, cross-check each recorded token against the raw text
      ;; at its offsets and report mismatches on standard output.
      (when (and alignp debug)
        (with-open-file (stream tt)
          (loop
              with text = (with-open-file (stream raw)
                            (loop
                                for c = (read-char stream nil nil)
                                while c collect c into result
                                finally (return (coerce result 'string))))
              with scanner = (ppcre:create-scanner "\\t")
              for i from 0
              for line = (read-line stream nil nil)
              for fields = (and line (ppcre:split scanner line))
              while line
              when fields do
                (let* ((start (parse-integer (first fields)))
                       (end (parse-integer (second fields)))
                       (token (subseq text start end)))
                  (unless (string= token (third fields))
                    (format
                     t "[~a] <~a:~a> ‘~a’ vs. ‘~a’.~%"
                     i start end token (third fields)))))))
      sentences)))

(defun epe-project (input gold output)
  ;;
  ;; project negation annotations from the *SEM-style .gold. file onto the
  ;; EPE dependency graphs read from .input., writing a CoNLL-style rendering
  ;; to .output.  imports .gold. into a temporary profile first, so that the
  ;; standard [incr tsdb()] analysis machinery can be used for indexing.
  ;;
  (unless (probe-file input)
    (error "epe-project(): invalid input file ‘~a’." input))
  (unless (probe-file gold)
    (error "epe-project(): invalid negation file ‘~a’." gold))
  (let* ((*conll-type* :starsem+)
         (profile (format nil "tmp/epe.~a" (gensym ""))))
    (purge-profile-cache :all)
    (do-import-items gold profile :format :conll :create :purge)
    (purge-profile-cache :all)
    (with-open-file (stream output :direction :output :if-exists :supersede)
      (loop
          with items = (analyze profile :tokensp t :commentp t)
          with table = (make-hash-table)
          with cues = (index-negations items table)
          for item in (read-epe input)
          for gold = (lookup-negation item cues table)
          do
            ;; NOTE(review): :stream nil — presumably starsem-output mutates
            ;; the item tokens as a side effect here; confirm against its
            ;; definition.
            (starsem-output
             (get-field :i-tokens item)
             (get-field :ncues gold)
             (get-field :nscopes gold)
             (get-field :nevents gold)
             :stream nil :format :epe)
            (conll-output item :stream stream :type :epe)
            (terpri stream)))))

(defun read-epe (stream &key debug)
  ;;
  ;; read a file (or stream) of concatenated EPE JSON graphs and return a
  ;; list of items, each an alist of :i-id (the graph ‘id’) and :i-tokens (a
  ;; list of node alists with :edges resolved to actual target nodes).  a
  ;; string "-" designates standard input; a nonexistent file yields nil.
  ;;
  (cond
   ((and (stringp stream) (string= stream "-"))
    ;; fixed: the original only (setf stream *standard-input*) and then fell
    ;; out of the cond(), returning the stream object itself instead of the
    ;; parsed items; recurse on the actual stream instead.
    (read-epe *standard-input* :debug debug))
   ((or (pathnamep stream) (stringp stream))
    (when (probe-file stream)
      (with-open-file (stream stream :direction :input)
        (read-epe stream :debug debug))))
   ((streamp stream)
    (labels ((clone (object keys)
               ;; copy selected fields, trying the key as given, then in
               ;; upcased and downcased keyword variants (JSON readers differ
               ;; in the case they preserve).
               (loop
                   for key in keys
                   for value
                   = (or (get-field key object)
                         (let ((key (intern (string-upcase key) :keyword)))
                           (get-field key object))
                         (let ((key (intern (string-downcase key) :keyword)))
                           (get-field key object)))
                   when value append (acons key value nil))))
      (let ((json:*json-identifier-name-to-lisp* #'string)
            items)
        (handler-case
            ;; decode JSON objects until end of file terminates the loop
            (loop
                for object = (json:decode-json stream)
                for id = (get-field :|id| object)
                for nodes = (get-field :|nodes| object)
                for graph
                = (loop
                      for node in nodes
                      for token
                      = (clone node '(:id :start :end :top :form :properties))
                      do
                        ;;
                        ;; _fix_me_
                        ;; the initial set of UDPipe outputs has ‘feats’ in UD
                        ;; format as one property; patch this up here.
                        ;;                                       (3-may-17; oe)
                        (let* ((properties (get-field :properties token))
                               (feats (get-field :|feats| properties)))
                          (when feats
                            (setf properties
                              (delete :|feats| properties :key #'first))
                            (nconc properties (conll-parse-features feats))))
                      collect token)
                do
                  ;; resolve edge targets to the freshly cloned nodes
                  (loop
                      for old in nodes
                      for new in graph
                      for edges
                      = (loop
                            for edge in (get-field :|edges| old)
                            for label = (get-field :|label| edge)
                            for target = (get-field :|target| edge)
                            for match
                            = (find
                               target graph
                               :key #'(lambda (node) (get-field :id node)))
                            when match collect (cons label match))
                      when edges do (set-field :edges edges new))
                  (push (pairlis '(:i-id :i-tokens) (list id graph)) items))
          (end-of-file ()
            (setf items (nreverse items))
            ;; read-time disabled consistency check over node and edge
            ;; identifiers; reports to standard output.
            #+:null
            (when debug
              (with-open-file (stream debug
                               :direction :output :if-exists :supersede)
                (loop
                    for item in items
                    for iid = (get-field :i-id item)
                    do
                      (loop
                          with graph = (get-field :i-tokens item)
                          with ids
                          = (loop
                                for node in graph
                                for id
                                = (let ((id (get-field :id node)))
                                    (if (stringp id)
                                        id
                                      #+:null
                                      (ignore-errors (parse-integer id))
                                      id))
                                when (null id)
                                do
                                  (format
                                   t "[~a] invalid node identifier: ‘~a’.~%"
                                   iid (get-field :id node))
                                else when (member id ids :test #'equal)
                                do
                                  (format
                                   t "[~a] duplicate node identifier: ‘~a’.~%"
                                   iid id)
                                else collect id into ids)
                          for node in graph
                          do
                            (loop
                                for edge in (get-field :edges node)
                                for target
                                = (let ((target (get-field :target edge)))
                                    (if (stringp target)
                                        (ignore-errors
                                         (parse-integer target))
                                      target))
                                when (null target)
                                do
                                  (format
                                   t "[~a] invalid edge identifier: ~
                                      ‘~a’ (‘~a’).~%"
                                   iid (get-field :target edge)
                                   (get-field :label edge))
                                else unless (member target ids :test #'equal)
                                do
                                  ;; fixed typo: was ‘unkown’
                                  (format
                                   t "[~a] unknown edge identifier: ~
                                      ‘~a’ (‘~a’).~%"
                                   iid target (get-field :label edge)))))))
            items)))))))

(defun conll-barcelona-expand (graph tokens &key (from :dsynt))
  ;;
  ;; rebuild the dependency structure of .graph. (destructively): map each
  ;; node back to its surface token (via the ‘id0’ feature) to recover :form,
  ;; :start, and :end; filter and rename features according to the source
  ;; layer .from. (:dsynt deep syntax vs. :bpa predicate–argument); and
  ;; re-attach :edges / :top from the ‘governors’ / ‘dependencies’ fields.
  ;; finally sort the nodes by character offsets.  returns .graph.
  ;;
  (loop
      for node in (get-field :i-tokens graph)
      do
        (set-field :top nil node)
        (set-field :edges nil node))
  (loop
      with graph = (get-field :i-tokens graph)
      with tokens = (get-field :i-tokens tokens)
      ;; NOTE(review): the original tested (eq from :dsyn), which can never be
      ;; true for the declared default :dsynt, i.e. the :dsynt filter was
      ;; unreachable; fixed to match the keyword default — confirm no caller
      ;; passes :dsyn.
      with filter = (if (eq from :dsynt)
                        '(:|id1| :|id2| :|id3| :|id4| :|id5|
                          :|anteposed| :|arguments| :|circum_paradigm|
                          :|conj_num| :|coord_conj| :|elided|
                          :|hasDuplicate| :|head_of_relative|
                          :|hasAntecedent| :|hasRelPro|
                          :|I| :|II| :|III| :|IV| :|root|)
                      '(:|benefactive| :|connect_check| :|direction|
                        :|extent| :|manner| :|place| :|pred_Name|
                        :|pred_Value| :|purpose| :|root| :|time| :|vn|))
      with rename = (when (eq from :bpa)
                      '((:|word| . :|lemma|) (:|dpos| . :|pos|)))
      for node in graph
      for governors
      = (loop
            for string in (ppcre:split "," (get-field :governors node))
            collect (ignore-errors (parse-integer string)))
      for dependencies = (ppcre:split "," (get-field :dependencies node))
      for features
      = (conll-parse-features
         (get-field :features node) :filter filter :rename rename)
      for id0 = (ignore-errors (parse-integer (get-field :|id0| features)))
      for token
      = (find id0 tokens :key #'(lambda (node) (get-field :id node)))
      for form = (get-field :form token)
      for start = (get-field :start token)
      for end = (get-field :end token)
      do
        (when token
          (nconc node (pairlis '(:form :start :end) (list form start end))))
        (when (eq from :bpa)
          ;; carry over the VerbNet class, dropping the ‘<prefix>:’ part
          (let* ((vn (get-field :vn node))
                 (colon (position #\: vn))
                 (vn (and colon (subseq vn (+ colon 1)))))
            (when vn (nconc features (acons :|vn| vn nil)))))
        (set-field :features (remove :|id0| features :key #'first) node)
        ;; invert the governor links: each governor gains an edge to this
        ;; node; a missing governor makes this node a :top node.
        (loop
            for governor in governors
            for dependency in dependencies
            for target
            = (find governor graph :key #'(lambda (node) (get-field :id node)))
            when target
            do (push (cons dependency node) (get-field :edges target))
            else do (set-field :top t node)))
  (set-field
   :i-tokens
   (sort
    (get-field :i-tokens graph)
    #'(lambda (one two)
        (let ((ostart (get-field :start one))
              (oend (get-field :end one))
              (tstart (get-field :start two))
              (tend (get-field :end two)))
          (when (and ostart oend)
            (or (null tstart) (null tend)
                (< ostart tstart)
                (and (= ostart tstart) (< oend tend)))))))
   graph)
  graph)