"""Derivation tree nodes and a recursive-descent parser for bracketed derivations.

The parser functions take the input string ``s`` and a position ``pos`` and
return ``(node, new_pos)``.  The regular expressions and the ``digest`` /
``discard`` helpers come from the project-local ``digest`` module.
"""
import sys

from digest import *


def _write_text(text):
    """Write ``text`` to stdout on both Python 2 and Python 3.

    A native ``str`` is written unchanged.  Anything else (in practice a
    Python 2 ``unicode`` object) is encoded as UTF-8 first, which is what the
    byte-oriented Python 2 stdout needs.
    """
    if not isinstance(text, str):
        text = text.encode('utf8')
    sys.stdout.write(text)


class Node:
    """Common base class of ``NonTerminal`` and ``Terminal``."""
    pass


class NonTerminal(Node):
    """Inner tree node: a name plus an ordered list of child nodes."""

    def __init__(self, name):
        self.name = name
        self.children = []
        # Memoized results of width() and depth(); None means "not computed".
        self.cache_width = None
        self.cache_depth = None

    def contains(self, l):
        """Return True if this node's name, or any descendant's, is in ``l``."""
        if self.name in l:
            return True
        return any(child.contains(l) for child in self.children)

    def show(self, depth=0):
        """Print the subtree as an indented outline, one node per line.

        Each level of nesting is indented by four more spaces than its parent.
        """
        _write_text(depth * " " + self.name + '\n')
        for child in self.children:
            child.show(depth + 4)

    def transform(self, d):
        """Rename nodes in this subtree using the mapping ``d`` (old -> new).

        Names that are not keys of ``d`` are left untouched.
        """
        # Uses the dictionary to replace certain terms with others.
        try:
            self.name = d[self.name]
        except KeyError:
            pass
        for child in self.children:
            child.transform(d)
        # The width depends on the name lengths in this subtree, so a cached
        # value may be stale after renaming.
        self.cache_width = None

    def show_tree(self):
        """Print the subtree level by level, with connector lines in between.

        Every level is written as one row of centered node names, followed by
        a row of "|" or "/---\\" connectors leading to the children.
        """
        current_line = [(0, self)]
        while current_line:
            current_pos = 0
            new_line = []
            connection_lines = []
            for left, node in current_line:
                # Get to the correct position on the line.
                sys.stdout.write(" " * max(0, left - current_pos))
                current_pos = left
                # Write the Node's identifier, and adjust the writer's position
                _write_text(node.name.center(node.width()))
                sys.stdout.write(" ")
                current_pos += node.width() + 1
                # If this node is a NonTerminal, add its children to the
                # new_line.  A NonTerminal without children has nothing to
                # connect to and is drawn like a leaf.
                if isinstance(node, NonTerminal) and node.children:
                    child_left = left
                    for child in node.children:
                        new_line.append((child_left, child))
                        last_child_left = child_left
                        child_left += child.width() + 1
                    # Floor division keeps the offsets integral on Python 3.
                    connection_lines.append((
                        left + node.children[0].width() // 2,
                        last_child_left + node.children[-1].width() // 2,
                    ))
            current_line = new_line
            sys.stdout.write('\n')
            # Write the connections
            current_pos = 0
            for left, right in connection_lines:
                if left == right:
                    sys.stdout.write(" " * (left - current_pos) + "|")
                    current_pos += left - current_pos + 1
                else:
                    sys.stdout.write(" " * (left - current_pos))
                    sys.stdout.write("/" + (right - left - 2) * "-" + "\\")
                    current_pos += right - current_pos + 2
            sys.stdout.write('\n')

    def depth(self):
        """Return the number of levels in this subtree (memoized).

        A node without children counts as one level.
        """
        if self.cache_depth is None:
            child_depths = [child.depth() for child in self.children]
            # max() of an empty list raises ValueError, so a childless node
            # is handled explicitly.
            self.cache_depth = 1 + (max(child_depths) if child_depths else 0)
        return self.cache_depth

    def width(self):
        """Return the number of columns needed to draw this subtree (memoized).

        This is the larger of the name length and the summed child widths
        plus one separator column between adjacent children.
        """
        if self.cache_width is None:
            children_width = (len(self.children) - 1
                              + sum(child.width() for child in self.children))
            self.cache_width = max(len(self.name), children_width)
        return self.cache_width


class Terminal(Node):
    """Leaf node holding only a name."""

    def __init__(self, name):
        self.name = name

    def contains(self, l):
        """Return True if this node's name is in ``l``."""
        return self.name in l

    def transform(self, d):
        """Rename this node using the mapping ``d``; unknown names are kept."""
        # Uses the dictionary to replace certain terms with others.
        try:
            self.name = d[self.name]
        except KeyError:
            pass

    def show(self, depth=0):
        """Print the name on its own line, indented by ``depth`` spaces."""
        sys.stdout.write(depth * " ")
        _write_text(self.name)
        sys.stdout.write('\n')

    def show_tree(self):
        """Do nothing: a lone terminal has no tree to draw."""
        pass

    def depth(self):
        """Return 1, the depth of a leaf."""
        return 1

    def width(self):
        """Return the length of the name."""
        return len(self.name)

    def center(self):
        """Return the column offset of the middle of the name."""
        return self.width() // 2


def parse_root(s, pos):
    """Parse a root node: a name followed by exactly one derivation.

    Returns ``(node, new_pos)`` where ``node`` is a NonTerminal with a single
    child.  NOTE(review): the meaning of the trailing ``True`` argument of
    digest/discard is defined in the ``digest`` module - confirm there.
    """
    # root-x NODE
    name, pos = digest(re_non_whitespaces_plus, s, pos, True)
    node = NonTerminal(name)
    pos = discard(re_whitespaces_plus, s, pos, True)
    child_node, pos = parse_derivation(s, pos)
    node.children.append(child_node)
    return node, pos


def parse_non_terminal(s, pos):
    """Parse an inner node and all of its bracketed children.

    Only the rule name is kept (as the node name); the other four fields are
    read to advance the position.  Returns ``(node, new_pos)``.
    """
    # edge-nr rulename logprob startspan endspan child+
    edge_nr, pos = digest(re_non_whitespaces_plus, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    rulename, pos = digest(re_non_whitespaces_plus, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    logprob, pos = digest(re_non_whitespaces_plus, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    startspan, pos = digest(re_non_whitespaces_plus, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    endspan, pos = digest(re_non_whitespaces_plus, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    node = NonTerminal(rulename)
    # Every child starts with an opening bracket.
    while s[pos] == "(":
        child_node, pos = parse_derivation(s, pos)
        node.children.append(child_node)
        pos = discard(re_whitespaces_star, s, pos, True)
    return node, pos


def parse_terminal(s, pos):
    """Parse a leaf: a double-quoted form followed by positional information.

    Two layouts are accepted (see the examples below).  The node name is the
    form including its surrounding double quotes.  Returns ``(node, new_pos)``.
    """
    # CM: "einigte" 37 "token [ +TNT tnt [ +PRBS ne-list [ REST null FIRST \\"1\\" ] +TAGS ne-list [ REST null FIRST \\"VFIN\\" ] ] +TO \\"24\\" +FROM \\"18\\" +IDS diff-list +TRAIT native_trait +FORM \\"einigte\\" ]"
    # Non-CM: "einigte" 16 17
    pos = discard(re_double_quote, s, pos, True)
    orth, pos = digest(re_any_but_double_quote, s, pos, True)
    node = Terminal('"' + orth + '"')
    pos = discard(re_double_quote, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    # Remembered so the chart mapping branch can re-read the first integer.
    pos_start = pos
    pos = discard(re_integer, s, pos, True)
    pos = discard(re_whitespaces_plus, s, pos, True)
    if s[pos].isdigit():
        # First try non-chart mapping case.
        pos = discard(re_integer, s, pos, True)
    else:
        # If there is no integer there, we assume chart mapping format of the result file.
        # One terminal may be connected to several input TOKENs.
        pos = pos_start
        while s[pos].isalnum():
            edge_nr, pos = digest(re_non_whitespaces_plus, s, pos, True)
            pos = discard(re_whitespaces_plus, s, pos, True)
            found = False
            pos += 1  # skip opening doublequote
            while not found:
                # skip escaped doublequote
                if s[pos] == '\\' and s[pos + 1] == '"':
                    pos += 1
                else:
                    found = (s[pos] == '"')
                pos += 1
            pos = discard(re_whitespaces_star, s, pos, True)
    return node, pos


def parse_derivation(s, pos):
    """Parse one bracketed derivation and dispatch on how its content starts.

    Four leading letters select a root node, a double quote a terminal and a
    digit a non-terminal.  Returns ``(node, new_pos)``.

    Raises:
        ValueError: if the content matches none of the three node kinds.  A
            diagnostic is also written to stderr.
    """
    pos = discard(re_open_bracket, s, pos)
    node = None
    unidentified = None
    if s[pos:pos + 4].isalpha():
        node, pos = parse_root(s, pos)
    elif s[pos:pos + 1] == '"':
        node, pos = parse_terminal(s, pos)
    elif s[pos:pos + 1].isdigit():
        node, pos = parse_non_terminal(s, pos)
    else:
        unidentified = "Can't identify node: " + s[pos:pos + 20]
        sys.stderr.write(unidentified + '\n')
    pos = discard(re_close_bracket, s, pos, True)
    if node is None:
        # Fail with a clear error instead of an unbound local at the return.
        raise ValueError(unidentified)
    return node, pos