# Natural Language Toolkit: NomBank Corpus Reader # # Copyright (C) 2001-2012 NLTK Project # Authors: Paul Bedaride # Edward Loper # URL: # For license information, see LICENSE.TXT import re import codecs from nltk.tree import Tree from xml.etree import ElementTree from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class NombankCorpusReader(CorpusReader): """ Corpus reader for the nombank corpus, which augments the Penn Treebank with information about the predicate argument structure of every noun instance. The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-noun basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ def __init__(self, root, nomfile, framefiles='', nounsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding=None): """ :param root: The root directory for this corpus. :param nomfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by nombank. """ # If framefiles is specified as a regexp, expand it. if isinstance(framefiles, basestring): framefiles = find_corpus_fileids(root, framefiles) framefiles = list(framefiles) # Initialze the corpus reader. CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles, encoding) # Record our frame fileids & nom file. self._nomfile = nomfile self._framefiles = framefiles self._nounsfile = nounsfile self._parse_fileid_xform = parse_fileid_xform self._parse_corpus = parse_corpus def raw(self, fileids=None): """ :return: the text contents of the given fileids, as a single string. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, basestring): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def instances(self, baseform=None): """ :return: a corpus view that acts as a list of ``NombankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: kwargs['instance_filter'] = lambda inst: inst.baseform==baseform return StreamBackedCorpusView(self.abspath(self._nomfile), lambda stream: self._read_instance_block(stream, **kwargs), encoding=self.encoding(self._nomfile)) def lines(self): """ :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView(self.abspath(self._nomfile), read_line_block, encoding=self.encoding(self._nomfile)) def roleset(self, roleset_id): """ :return: the xml description for the given roleset. """ baseform = roleset_id.split('.')[0] baseform = baseform.replace('perc-sign','%') baseform = baseform.replace('oneslashonezero', '1/10').replace('1/10','1-slash-10') framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % roleset_id) # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() for roleset in etree.findall('predicate/roleset'): if roleset.attrib['id'] == roleset_id: return roleset else: raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile)) def rolesets(self, baseform=None): """ :return: list of xml descriptions for rolesets. """ if baseform is not None: framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % baseform) framefiles = [framefile] else: framefiles = self._framefiles rsets = [] for framefile in framefiles: # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() rsets.append(etree.findall('predicate/roleset')) return LazyConcatenation(rsets) def nouns(self): """ :return: a corpus view that acts as a list of all noun lemmas in this corpus (from the nombank.1.0.words file). """ return StreamBackedCorpusView(self.abspath(self._nounsfile), read_line_block, encoding=self.encoding(self._nounsfile)) def _read_instance_block(self, stream, instance_filter=lambda inst: True): block = [] # Read 100 at a time. for i in range(100): line = stream.readline().strip() if line: inst = NombankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus) if instance_filter(inst): block.append(inst) return block ###################################################################### #{ Nombank Instance & related datatypes ###################################################################### class NombankInstance(object): def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus=None): self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence.""" self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero.""" self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements.""" self.baseform = baseform """The baseform of the predicate.""" self.sensenumber = sensenumber """The sense number of the predicate.""" self.predicate = predicate """A ``NombankTreePointer`` indicating the position of this instance's predicate within its containing sentence.""" self.predid = predid """Identifier of the predicate.""" self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's argument in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate.""" self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this nombank corpus.""" @property def roleset(self): """The name of the roleset used by this instance's predicate. Use ``nombank.roleset() `` to look up information about the roleset.""" r = self.baseform.replace('%','perc-sign') r = r.replace('1/10','1-slash-10').replace('1-slash-10','oneslashonezero') return '%s.%s'%(r, self.sensenumber) def __repr__(self): return ('' % (self.fileid, self.sentnum, self.wordnum)) def __str__(self): s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum, self.baseform, self.sensenumber) items = self.arguments + ((self.predicate, 'rel'),) for (argloc, argid) in sorted(items): s += ' %s-%s' % (argloc, argid) return s def _get_tree(self): if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] tree = property(_get_tree, doc=""" The parse tree corresponding to this instance, or None if the corresponding tree is not available.""") @staticmethod def parse(s, parse_fileid_xform=None, parse_corpus=None): pieces = s.split() if len(pieces) < 6: raise ValueError('Badly formatted nombank line: %r' % s) # Divide the line into its basic pieces. (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5] args = pieces[5:] rel = [args.pop(i) for i,p in enumerate(args) if '-rel' in p] if len(rel) != 1: raise ValueError('Badly formatted nombank line: %r' % s) # Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid) # Convert sentence & word numbers to ints. sentnum = int(sentnum) wordnum = int(wordnum) # Parse the predicate location. predloc, predid = rel[0].split('-', 1) predicate = NombankTreePointer.parse(predloc) # Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split('-', 1) arguments.append( (NombankTreePointer.parse(argloc), argid) ) # Put it all together. return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus) class NombankPointer(object): """ A pointer used by nombank to identify one or more constituents in a parse tree. ``NombankPointer`` is an abstract base class with three concrete subclasses: - ``NombankTreePointer`` is used to point to single constituents. - ``NombankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``NombankTreePointer`` pointers. - ``NombankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. """ def __init__(self): if self.__class__ == NombankPoitner: raise NotImplementedError() class NombankChainTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``NombankSplitTreePointer`` or ``NombankTreePointer`` pointers.""" def __str__(self): return '*'.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*CHAIN*', [p.select(tree) for p in self.pieces]) class NombankSplitTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements are all ``NombankTreePointer`` pointers.""" def __str__(self): return ','.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*SPLIT*', [p.select(tree) for p in self.pieces]) class NombankTreePointer(NombankPointer): """ wordnum:height*wordnum:height*... wordnum:height, """ def __init__(self, wordnum, height): self.wordnum = wordnum self.height = height @staticmethod def parse(s): # Deal with chains (xx*yy*zz) pieces = s.split('*') if len(pieces) > 1: return NombankChainTreePointer([NombankTreePointer.parse(elt) for elt in pieces]) # Deal with split args (xx,yy,zz) pieces = s.split(',') if len(pieces) > 1: return NombankSplitTreePointer([NombankTreePointer.parse(elt) for elt in pieces]) # Deal with normal pointers. pieces = s.split(':') if len(pieces) != 2: raise ValueError('bad nombank pointer %r' % s) return NombankTreePointer(int(pieces[0]), int(pieces[1])) def __str__(self): return '%s:%s' % (self.wordnum, self.height) def __repr__(self): return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height) def __cmp__(self, other): while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, NombankTreePointer): return cmp(id(self), id(other)) return cmp( (self.wordnum, -self.height), (other.wordnum, -other.height) ) def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return tree[self.treepos(tree)] def treepos(self, tree): """ Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError('Parse tree not avaialable') stack = [tree] treepos = [] wordnum = 0 while True: #print treepos #print stack[-1] # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[:len(treepos)-self.height-1]) else: wordnum += 1 stack.pop()