# -*- coding: iso-8859-1 -*-

# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE)
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Selina Dennis
# URL:
# For license information, see LICENSE.TXT

"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
with NLTK.

The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""

import os
import re

from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
from string import split

from util import *
from api import *


class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers: one over the ``psd`` subdirectory
    (parsed files) and one over the ``pos`` subdirectory (tagged files).
    A *document identifier* is a file identifier with its four-character
    ``.pos`` / ``.psd`` extension removed.
    """

    def __init__(self, root, encoding=None):
        """
        Build the two sub-readers and check that they cover the same
        set of documents.

        :param root: Root directory of the corpus; it must contain the
            ``psd`` and ``pos`` subdirectories.
        :param encoding: Encoding handed to both sub-readers.
        :raise ValueError: If the documents found under ``psd`` and
            ``pos`` do not match.
        """
        # First initialisation with an empty file list, so that
        # ``self.root`` is usable for building the sub-reader paths.
        CorpusReader.__init__(self, root, [], encoding)

        # NOTE(review): '.psd' is passed as the third positional argument
        # of BracketParseCorpusReader; confirm which parameter that is.
        self._psd_reader = YCOEParseCorpusReader(
            self.root.join('psd'), '.*', '.psd', encoding=encoding)
        # The encoding is passed by keyword.  Previously '.pos' was passed
        # positionally into the ``encoding`` slot and then discarded, so
        # the POS files never received the caller's encoding.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join('pos'), '.*', encoding=encoding)

        # Make sure we have a consistent set of items:
        documents = set(f[:-4] for f in self._psd_reader.fileids())
        if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
            raise ValueError('Items in "psd" and "pos" '
                             'subdirectories do not match.')

        # Second initialisation, now with the full, sorted file list.
        fileids = sorted(['%s.psd' % doc for doc in documents] +
                         ['%s.pos' % doc for doc in documents])
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, an iterable of file
            identifiers, or None for every document.
        :raise KeyError: If a given file identifier is not part of
            this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, basestring):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending identifier, not the whole collection.
                raise KeyError('File id %s not found' % (f,))
        # Strip off the '.pos' and '.psd' extensions.
        return sorted(set(f[:-4] for f in fileids))

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every file.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, basestring):
            documents = [documents]
        return sorted(set(['%s.pos' % doc for doc in documents] +
                          ['%s.psd' % doc for doc in documents]))

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, an iterable of
            document identifiers, or None for every document.
        :param subcorpus: The extension to append, without the dot
            (``'pos'`` or ``'psd'``).
        :raise ValueError: If an identifier is unknown; the message
            differs when the value looks like a file identifier.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, basestring):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    # A trailing '.pos'/'.psd' suggests the caller passed
                    # a file identifier by mistake.
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier. (Use corpus.documents() to get '
                            'a list of document identifiers.')
                    else:
                        raise ValueError('Document identifier %s not found'
                                         % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to the POS sub-reader's ``words`` for the given documents."""
        return self._pos_reader.words(self._getfileids(documents, 'pos'))

    def sents(self, documents=None):
        """Delegate to the POS sub-reader's ``sents`` for the given documents."""
        return self._pos_reader.sents(self._getfileids(documents, 'pos'))

    def paras(self, documents=None):
        """Delegate to the POS sub-reader's ``paras`` for the given documents."""
        return self._pos_reader.paras(self._getfileids(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Delegate to the POS sub-reader's ``tagged_words``."""
        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Delegate to the POS sub-reader's ``tagged_sents``."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Delegate to the POS sub-reader's ``tagged_paras``."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Delegate to the PSD sub-reader's ``parsed_sents``."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))


class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Parse the bracketed string ``t`` after removing its
        ``(CODE ...)`` and ``(ID ...)`` nodes.

        Returns None when nothing but an empty pair of parentheses is
        left; otherwise returns whatever the base class ``_parse``
        returns.
        """
        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        if re.match(r'\s*\(\s*\)\s*$', t):
            return None
        return BracketParseCorpusReader._parse(self, t)


class YCOETaggedCorpusReader(TaggedCorpusReader):
    """
    Tagged corpus reader for the YCOE ``pos`` files: ``'_'`` separates a
    word from its tag, and sentences are split by a regexp tokenizer
    run in gaps mode.
    """

    def __init__(self, root, items, encoding=None):
        """
        :param root: Directory holding the tagged files.
        :param items: File identifiers (or a pattern for them), passed
            straight through to ``TaggedCorpusReader``.
        :param encoding: Encoding forwarded to ``TaggedCorpusReader``.
        """
        # The pattern describes the gaps between sentences: whitespace
        # that follows "/." and any token ending in _CODE or _ID.
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        # Forward the encoding; it used to be accepted and then ignored.
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer,
                                    encoding=encoding)


#: A dictionary of all documents and their titles in ycoe.
documents = {
    # Keys are YCOE text names (several carry a period suffix such as
    # ".o3"); values are human-readable titles.
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': "Ælfric's Lives of Saints",
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': "Alexander's Letter to Aristotle",
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': "Bede's History of the English Church",
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': "Boethius' Consolation of Philosophy",
    'cobyrhtf.o3': "Byrhtferth's Manual",
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': "Ælfric's Catholic Homilies I",
    'cocathom2.o3': "Ælfric's Catholic Homilies II",
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Was a copy of the 'coeluc1' title ("Elucidarium 1").
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': "Ælfric's Epilogue to Genesis",
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': "Gregory's Dialogues (C)",
    'cogregdH.o23': "Gregory's Dialogues (H)",
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': "Wulfstan's Institute of Polity (D)",
    'coinspolX': "Wulfstan's Institute of Polity (X)",
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': "Alfred's Introduction to Laws",
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': "Ælfric's Letter to Sigefyrth",
    'colsigewB': "Ælfric's Letter to Sigeweard (B)",
    'colsigewZ.o34': "Ælfric's Letter to Sigeweard (Z)",
    'colwgeat': "Ælfric's Letter to Wulfgeat",
    'colwsigeT': "Ælfric's Letter to Wulfsige (T)",
    'colwsigeXa.o34': "Ælfric's Letter to Wulfsige (Xa)",
    'colwstan1.o3': "Ælfric's Letter to Wulfstan I",
    'colwstan2.o3': "Ælfric's Letter to Wulfstan II",
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': "Ælfric's Preface to Catholic Homilies I",
    'coprefcath2.o3': "Ælfric's Preface to Catholic Homilies II",
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': "Ælfric's Preface to Genesis",
    'copreflives.o3': "Ælfric's Preface to Lives of Saints",
    'coprefsolilo': "Preface to Augustine's Soliloquies",
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': "St. Augustine's Soliloquies",
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': "Ælfric's De Temporibus Anni",
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': "Wulfstan's Homilies",
}