Package Bio :: Package Nexus :: Module Trees
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Trees

  1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
  7  """Tree class to handle phylogenetic trees. 
  8   
  9  Provides a set of methods to read and write newick-format tree descriptions, 
 10  get information about trees (monphyly of taxon sets, congruence between trees, 
 11  common ancestors,...) and to manipulate trees (reroot trees, split terminal 
 12  nodes). 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  import random 
 18  import sys 
 19  from . import Nodes 
 20   
 21   
 22  PRECISION_BRANCHLENGTH = 6 
 23  PRECISION_SUPPORT = 6 
 24  NODECOMMENT_START = '[&' 
 25  NODECOMMENT_END = ']' 
 26   
 27   
28 -class TreeError(Exception):
29 pass
30 31
32 -class NodeData(object):
33 """Stores tree-relevant data associated with nodes (e.g. branches or otus).""" 34
35 - def __init__(self, taxon=None, branchlength=0.0, support=None, comment=None):
36 self.taxon = taxon 37 self.branchlength = branchlength 38 self.support = support 39 self.comment = comment
40 41
42 -class Tree(Nodes.Chain):
43 """Represents a tree using a chain of nodes with on predecessor (=ancestor) 44 and multiple successors (=subclades). 45 """ 46 47 # A newick tree is parsed into nested list and then converted to a node list in two stages 48 # mostly due to historical reasons. This could be done in one swoop). Note: parentheses ( ) and 49 # colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much 50 # easier when parsing trees. 51 52 # NOTE: Tree should store its data class in something like self.dataclass=data, 53 # so that nodes that are generated have easy access to the data class 54 # Some routines use automatically NodeData, this needs to be more concise 55
56 - def __init__(self, tree=None, weight=1.0, rooted=False, name='', data=NodeData, values_are_support=False, max_support=1.0):
57 """Ntree(self,tree).""" 58 Nodes.Chain.__init__(self) 59 self.dataclass = data 60 self.__values_are_support = values_are_support 61 self.max_support = max_support 62 self.weight = weight 63 self.rooted = rooted 64 self.name = name 65 root = Nodes.Node(data()) 66 self.root = self.add(root) 67 if tree: # use the tree we have 68 # if Tree is called from outside Nexus parser, we need to get rid of linebreaks, etc 69 tree = tree.strip().replace('\n', '').replace('\r', '') 70 # there's discrepancy whether newick allows semicolons et the end 71 tree = tree.rstrip(';') 72 subtree_info, base_info = self._parse(tree) 73 root.data = self._add_nodedata(root.data, [[], base_info]) 74 self._add_subtree(parent_id=root.id, tree=subtree_info)
75
76 - def _parse(self, tree):
77 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively.""" 78 # Remove any leading/trailing white space - want any string starting 79 # with " (..." should be recognised as a leaf, "(..." 80 tree = tree.strip() 81 if tree.count('(') != tree.count(')'): 82 raise TreeError('Parentheses do not match in (sub)tree: ' + tree) 83 if tree.count('(') == 0: # a leaf 84 # check if there's a colon, or a special comment, or both after the taxon name 85 nodecomment = tree.find(NODECOMMENT_START) 86 colon = tree.find(':') 87 if colon == -1 and nodecomment == -1: # none 88 return [tree, [None]] 89 elif colon == -1 and nodecomment > -1: # only special comment 90 return [tree[:nodecomment], self._get_values(tree[nodecomment:])] 91 elif colon > -1 and nodecomment == -1: # only numerical values 92 return [tree[:colon], self._get_values(tree[colon + 1:])] 93 elif colon < nodecomment: # taxon name ends at first colon or with special comment 94 return [tree[:colon], self._get_values(tree[colon + 1:])] 95 else: 96 return [tree[:nodecomment], self._get_values(tree[nodecomment:])] 97 else: 98 closing = tree.rfind(')') 99 val = self._get_values(tree[closing + 1:]) 100 if not val: 101 val = [None] 102 subtrees = [] 103 plevel = 0 104 prev = 1 105 incomment = False 106 for p in range(1, closing): 107 if not incomment and tree[p] == '(': 108 plevel += 1 109 elif not incomment and tree[p] == ')': 110 plevel -= 1 111 elif tree[p:].startswith(NODECOMMENT_START): 112 incomment = True 113 elif incomment and tree[p] == NODECOMMENT_END: 114 incomment = False 115 elif not incomment and tree[p] == ',' and plevel == 0: 116 subtrees.append(tree[prev:p]) 117 prev = p + 1 118 119 subtrees.append(tree[prev:closing]) 120 subclades = [self._parse(subtree) for subtree in subtrees] 121 return [subclades, val]
122
123 - def _add_subtree(self, parent_id=None, tree=None):
124 """Adds leaf or tree (in newick format) to a parent_id.""" 125 if parent_id is None: 126 raise TreeError('Need node_id to connect to.') 127 for st in tree: 128 nd = self.dataclass() 129 nd = self._add_nodedata(nd, st) 130 if isinstance(st[0], list): # it's a subtree 131 sn = Nodes.Node(nd) 132 self.add(sn, parent_id) 133 self._add_subtree(sn.id, st[0]) 134 else: # it's a leaf 135 nd.taxon = st[0] 136 leaf = Nodes.Node(nd) 137 self.add(leaf, parent_id)
138
139 - def _add_nodedata(self, nd, st):
140 """Add data to the node parsed from the comments, taxon and support.""" 141 if isinstance(st[1][-1], str) and st[1][-1].startswith(NODECOMMENT_START): 142 nd.comment = st[1].pop(-1) 143 # if the first element is a string, it's the subtree node taxon 144 elif isinstance(st[1][0], str): 145 nd.taxon = st[1][0] 146 st[1] = st[1][1:] 147 if len(st) > 1: 148 if len(st[1]) >= 2: # if there's two values, support comes first. Is that always so? 149 nd.support = st[1][0] 150 if st[1][1] is not None: 151 nd.branchlength = st[1][1] 152 elif len(st[1]) == 1: # otherwise it could be real branchlengths or support as branchlengths 153 if not self.__values_are_support: # default 154 if st[1][0] is not None: 155 nd.branchlength = st[1][0] 156 else: 157 nd.support = st[1][0] 158 return nd
159
160 - def _get_values(self, text):
161 """Extracts values (support/branchlength) from xx[:yyy], xx.""" 162 if text == '': 163 return None 164 nodecomment = None 165 if NODECOMMENT_START in text: # if there's a [&....] comment, cut it out 166 nc_start = text.find(NODECOMMENT_START) 167 nc_end = text.find(NODECOMMENT_END) 168 if nc_end == -1: 169 raise TreeError('Error in tree description: Found %s without matching %s' 170 % (NODECOMMENT_START, NODECOMMENT_END)) 171 nodecomment = text[nc_start:nc_end + 1] 172 text = text[:nc_start] + text[nc_end + 1:] 173 174 # pase out supports and branchlengths, with internal node taxa info 175 values = [] 176 taxonomy = None 177 for part in [t.strip() for t in text.split(":")]: 178 if part: 179 try: 180 values.append(float(part)) 181 except ValueError: 182 assert taxonomy is None, "Two string taxonomies?" 183 taxonomy = part 184 if taxonomy: 185 values.insert(0, taxonomy) 186 if nodecomment: 187 values.append(nodecomment) 188 return values
189
190 - def _walk(self, node=None):
191 """Return all node_ids downwards from a node.""" 192 if node is None: 193 node = self.root 194 for n in self.node(node).succ: 195 yield n 196 for sn in self._walk(n): 197 yield sn
198
199 - def node(self, node_id):
200 """Return the instance of node_id. 201 202 node = node(self,node_id) 203 """ 204 if node_id not in self.chain: 205 raise TreeError('Unknown node_id: %d' % node_id) 206 return self.chain[node_id]
207
208 - def split(self, parent_id=None, n=2, branchlength=1.0):
209 """Speciation: generates n (default two) descendants of a node. 210 211 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0): 212 """ 213 if parent_id is None: 214 raise TreeError('Missing node_id.') 215 ids = [] 216 parent_data = self.chain[parent_id].data 217 for i in range(n): 218 node = Nodes.Node() 219 if parent_data: 220 node.data = self.dataclass() 221 # each node has taxon and branchlength attribute 222 if parent_data.taxon: 223 node.data.taxon = parent_data.taxon + str(i) 224 node.data.branchlength = branchlength 225 ids.append(self.add(node, parent_id)) 226 return ids
227
228 - def search_taxon(self, taxon):
229 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes. 230 231 node_id = search_taxon(self,taxon) 232 """ 233 for id, node in self.chain.items(): 234 if node.data.taxon == taxon: 235 return id 236 return None
237
238 - def prune(self, taxon):
239 """Prunes a terminal taxon from the tree. 240 241 id_of_previous_node = prune(self,taxon) 242 If taxon is from a bifurcation, the connectiong node will be collapsed 243 and its branchlength added to remaining terminal node. This might be no 244 longer a meaningful value' 245 """ 246 id = self.search_taxon(taxon) 247 if id is None: 248 raise TreeError('Taxon not found: %s' % taxon) 249 elif id not in self.get_terminals(): 250 raise TreeError('Not a terminal taxon: %s' % taxon) 251 else: 252 prev = self.unlink(id) 253 self.kill(id) 254 if len(self.node(prev).succ) == 1: 255 if prev == self.root: # we deleted one branch of a bifurcating root, then we have to move the root upwards 256 self.root = self.node(self.root).succ[0] 257 self.node(self.root).branchlength = 0.0 258 self.kill(prev) 259 else: 260 succ = self.node(prev).succ[0] 261 new_bl = self.node(prev).data.branchlength + self.node(succ).data.branchlength 262 self.collapse(prev) 263 self.node(succ).data.branchlength = new_bl 264 return prev
265
266 - def get_taxa(self, node_id=None):
267 """Return a list of all otus downwards from a node. 268 269 nodes = get_taxa(self,node_id=None) 270 """ 271 if node_id is None: 272 node_id = self.root 273 if node_id not in self.chain: 274 raise TreeError('Unknown node_id: %d.' % node_id) 275 if self.chain[node_id].succ == []: 276 if self.chain[node_id].data: 277 return [self.chain[node_id].data.taxon] 278 else: 279 return None 280 else: 281 list = [] 282 for succ in self.chain[node_id].succ: 283 list.extend(self.get_taxa(succ)) 284 return list
285
286 - def get_terminals(self):
287 """Return a list of all terminal nodes.""" 288 return [i for i in self.all_ids() if self.node(i).succ == []]
289
290 - def is_terminal(self, node):
291 """Returns True if node is a terminal node.""" 292 return self.node(node).succ == []
293
294 - def is_internal(self, node):
295 """Returns True if node is an internal node.""" 296 return len(self.node(node).succ) > 0
297
298 - def is_preterminal(self, node):
299 """Returns True if all successors of a node are terminal ones.""" 300 if self.is_terminal(node): 301 return False not in [self.is_terminal(n) for n in self.node(node).succ] 302 else: 303 return False
304
305 - def count_terminals(self, node=None):
306 """Counts the number of terminal nodes that are attached to a node.""" 307 if node is None: 308 node = self.root 309 return len([n for n in self._walk(node) if self.is_terminal(n)])
310
311 - def collapse_genera(self, space_equals_underscore=True):
312 """Collapses all subtrees which belong to the same genus. 313 314 (i.e share the same first word in their taxon name.) 315 """ 316 while True: 317 for n in self._walk(): 318 if self.is_terminal(n): 319 continue 320 taxa = self.get_taxa(n) 321 genera = [] 322 for t in taxa: 323 if space_equals_underscore: 324 t = t.replace(' ', '_') 325 try: 326 genus = t.split('_', 1)[0] 327 except IndexError: 328 genus = 'None' 329 if genus not in genera: 330 genera.append(genus) 331 if len(genera) == 1: 332 self.node(n).data.taxon = genera[0] + ' <collapsed>' 333 # now we kill all nodes downstream 334 nodes2kill = [kn for kn in self._walk(node=n)] 335 for kn in nodes2kill: 336 self.kill(kn) 337 self.node(n).succ = [] 338 break # break out of for loop because node list from _walk will be inconsistent 339 else: # for loop exhausted: no genera to collapse left 340 break # while
341
342 - def sum_branchlength(self, root=None, node=None):
343 """Adds up the branchlengths from root (default self.root) to node. 344 345 sum = sum_branchlength(self,root=None,node=None) 346 """ 347 if root is None: 348 root = self.root 349 if node is None: 350 raise TreeError('Missing node id.') 351 blen = 0.0 352 while node is not None and node is not root: 353 blen += self.node(node).data.branchlength 354 node = self.node(node).prev 355 return blen
356
357 - def set_subtree(self, node):
358 """Return subtree as a set of nested sets. 359 360 sets = set_subtree(self,node) 361 """ 362 if self.node(node).succ == []: 363 return self.node(node).data.taxon 364 else: 365 try: 366 return frozenset(self.set_subtree(n) for n in self.node(node).succ) 367 except Exception: 368 print(node) 369 print(self.node(node).succ) 370 for n in self.node(node).succ: 371 print("%s %s" % (n, self.set_subtree(n))) 372 print([self.set_subtree(n) for n in self.node(node).succ]) 373 raise
374
375 - def is_identical(self, tree2):
376 """Compare tree and tree2 for identity. 377 378 result = is_identical(self,tree2) 379 """ 380 return self.set_subtree(self.root) == tree2.set_subtree(tree2.root)
381
382 - def is_compatible(self, tree2, threshold, strict=True):
383 """Compares branches with support>threshold for compatibility. 384 385 result = is_compatible(self,tree2,threshold) 386 """ 387 # check if both trees have the same set of taxa. strict=True enforces this. 388 missing2 = set(self.get_taxa()) - set(tree2.get_taxa()) 389 missing1 = set(tree2.get_taxa()) - set(self.get_taxa()) 390 if strict and (missing1 or missing2): 391 if missing1: 392 print('Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1), self.name)) 393 if missing2: 394 print('Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2), tree2.name)) 395 raise TreeError('Can\'t compare trees with different taxon compositions.') 396 t1 = [(set(self.get_taxa(n)), self.node(n).data.support) for n in self.all_ids() if 397 self.node(n).succ and 398 (self.node(n).data and self.node(n).data.support and self.node(n).data.support >= threshold)] 399 t2 = [(set(tree2.get_taxa(n)), tree2.node(n).data.support) for n in tree2.all_ids() if 400 tree2.node(n).succ and 401 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support >= threshold)] 402 conflict = [] 403 for (st1, sup1) in t1: 404 for (st2, sup2) in t2: 405 if not st1.issubset(st2) and not st2.issubset(st1): # don't hiccup on upstream nodes 406 intersect, notin1, notin2 = st1 & st2, st2 - st1, st1 - st2 # all three are non-empty sets 407 # if notin1==missing1 or notin2==missing2 <==> st1.issubset(st2) or st2.issubset(st1) ??? 408 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)): # omit conflicts due to missing taxa 409 conflict.append((st1, sup1, st2, sup2, intersect, notin1, notin2)) 410 return conflict
411
412 - def common_ancestor(self, node1, node2):
413 """Return the common ancestor that connects two nodes. 414 415 node_id = common_ancestor(self,node1,node2) 416 """ 417 l1 = [self.root] + self.trace(self.root, node1) 418 l2 = [self.root] + self.trace(self.root, node2) 419 return [n for n in l1 if n in l2][-1]
420
421 - def distance(self, node1, node2):
422 """Add and return the sum of the branchlengths between two nodes. 423 424 dist = distance(self,node1,node2) 425 """ 426 ca = self.common_ancestor(node1, node2) 427 return self.sum_branchlength(ca, node1) + self.sum_branchlength(ca, node2)
428
429 - def is_monophyletic(self, taxon_list):
430 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise. 431 432 result = is_monophyletic(self,taxon_list) 433 """ 434 if isinstance(taxon_list, str): 435 taxon_set = set([taxon_list]) 436 else: 437 taxon_set = set(taxon_list) 438 node_id = self.root 439 while True: 440 subclade_taxa = set(self.get_taxa(node_id)) 441 if subclade_taxa == taxon_set: # are we there? 442 return node_id 443 else: # check subnodes 444 for subnode in self.chain[node_id].succ: 445 if set(self.get_taxa(subnode)).issuperset(taxon_set): # taxon_set is downstream 446 node_id = subnode 447 break # out of for loop 448 else: 449 return -1 # taxon set was not with successors, for loop exhausted
450
451 - def is_bifurcating(self, node=None):
452 """Return True if tree downstream of node is strictly bifurcating.""" 453 if node is None: 454 node = self.root 455 if node == self.root and len(self.node(node).succ) == 3: # root can be trifurcating, because it has no ancestor 456 return self.is_bifurcating(self.node(node).succ[0]) and \ 457 self.is_bifurcating(self.node(node).succ[1]) and \ 458 self.is_bifurcating(self.node(node).succ[2]) 459 if len(self.node(node).succ) == 2: 460 return self.is_bifurcating(self.node(node).succ[0]) and self.is_bifurcating(self.node(node).succ[1]) 461 elif len(self.node(node).succ) == 0: 462 return True 463 else: 464 return False
465
466 - def branchlength2support(self):
467 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0 468 469 This is necessary when support has been stored as branchlength (e.g. paup), and has thus 470 been read in as branchlength. 471 """ 472 for n in self.chain: 473 self.node(n).data.support = self.node(n).data.branchlength 474 self.node(n).data.branchlength = 0.0
475
476 - def convert_absolute_support(self, nrep):
477 """Convert absolute support (clade-count) to rel. frequencies. 478 479 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of 480 calculating relative frequencies. 481 """ 482 for n in self._walk(): 483 if self.node(n).data.support: 484 self.node(n).data.support /= float(nrep)
485
486 - def has_support(self, node=None):
487 """Returns True if any of the nodes has data.support != None.""" 488 for n in self._walk(node): 489 if self.node(n).data.support: 490 return True 491 else: 492 return False
493
494 - def randomize(self, ntax=None, taxon_list=None, branchlength=1.0, branchlength_sd=None, bifurcate=True):
495 """Generates a random tree with ntax taxa and/or taxa from taxlabels. 496 497 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True) 498 Trees are bifurcating by default. (Polytomies not yet supported). 499 """ 500 if not ntax and taxon_list: 501 ntax = len(taxon_list) 502 elif not taxon_list and ntax: 503 taxon_list = ['taxon' + str(i + 1) for i in range(ntax)] 504 elif not ntax and not taxon_list: 505 raise TreeError('Either numer of taxa or list of taxa must be specified.') 506 elif ntax != len(taxon_list): 507 raise TreeError('Length of taxon list must correspond to ntax.') 508 # initiate self with empty root 509 self.__init__() 510 terminals = self.get_terminals() 511 # bifurcate randomly at terminal nodes until ntax is reached 512 while len(terminals) < ntax: 513 newsplit = random.choice(terminals) 514 new_terminals = self.split(parent_id=newsplit, branchlength=branchlength) 515 # if desired, give some variation to the branch length 516 if branchlength_sd: 517 for nt in new_terminals: 518 bl = random.gauss(branchlength, branchlength_sd) 519 if bl < 0: 520 bl = 0 521 self.node(nt).data.branchlength = bl 522 terminals.extend(new_terminals) 523 terminals.remove(newsplit) 524 # distribute taxon labels randomly 525 random.shuffle(taxon_list) 526 for (node, name) in zip(terminals, taxon_list): 527 self.node(node).data.taxon = name
528
529 - def display(self):
530 """Quick and dirty lists of all nodes.""" 531 table = [('#', 'taxon', 'prev', 'succ', 'brlen', 'blen (sum)', 'support', 'comment')] 532 # Sort this to be consistent across CPython, Jython, etc 533 for i in sorted(self.all_ids()): 534 n = self.node(i) 535 if not n.data: 536 table.append((str(i), '-', str(n.prev), str(n.succ), '-', '-', '-', '-')) 537 else: 538 tx = n.data.taxon 539 if not tx: 540 tx = '-' 541 blength = "%0.2f" % n.data.branchlength 542 if blength is None: 543 blength = '-' 544 sum_blength = '-' 545 else: 546 sum_blength = "%0.2f" % self.sum_branchlength(node=i) 547 support = n.data.support 548 if support is None: 549 support = '-' 550 else: 551 support = "%0.2f" % support 552 comment = n.data.comment 553 if comment is None: 554 comment = '-' 555 table.append((str(i), tx, str(n.prev), str(n.succ), 556 blength, sum_blength, support, comment)) 557 print('\n'.join('%3s %32s %15s %15s %8s %10s %8s %20s' % l for l in table)) 558 print('\nRoot: %s' % self.root)
559
560 - def to_string(self, support_as_branchlengths=False, 561 branchlengths_only=False, plain=True, plain_newick=False, 562 ladderize=None, ignore_comments=True):
563 """Return a paup compatible tree line.""" 564 # if there's a conflict in the arguments, we override plain=True 565 if support_as_branchlengths or branchlengths_only: 566 plain = False 567 self.support_as_branchlengths = support_as_branchlengths 568 self.branchlengths_only = branchlengths_only 569 self.ignore_comments = ignore_comments 570 self.plain = plain 571 572 def make_info_string(data, terminal=False): 573 """Creates nicely formatted support/branchlengths.""" 574 # CHECK FORMATTING 575 if self.plain: # plain tree only. That's easy. 576 info_string = '' 577 elif self.support_as_branchlengths: # support as branchlengths (eg. PAUP), ignore actual branchlengths 578 if terminal: # terminal branches have 100% support 579 info_string = ':%1.2f' % self.max_support 580 elif data.support: 581 info_string = ':%1.2f' % (data.support) 582 else: 583 info_string = ':0.00' 584 elif self.branchlengths_only: # write only branchlengths, ignore support 585 info_string = ':%1.5f' % (data.branchlength) 586 else: # write suport and branchlengths (e.g. .con tree of mrbayes) 587 if terminal: 588 info_string = ':%1.5f' % (data.branchlength) 589 else: 590 if data.branchlength is not None and data.support is not None: # we have blen and suppport 591 info_string = '%1.2f:%1.5f' % (data.support, data.branchlength) 592 elif data.branchlength is not None: # we have only blen 593 info_string = '0.00000:%1.5f' % (data.branchlength) 594 elif data.support is not None: # we have only support 595 info_string = '%1.2f:0.00000' % (data.support) 596 else: 597 info_string = '0.00:0.00000' 598 if not ignore_comments and hasattr(data, 'nodecomment'): 599 info_string = str(data.nodecomment) + info_string 600 return info_string
601 602 def ladderize_nodes(nodes, ladderize=None): 603 """Sorts node numbers according to the number of terminal nodes.""" 604 if ladderize in ['left', 'LEFT', 'right', 'RIGHT']: 605 succnode_terminals = sorted((self.count_terminals(node=n), n) for n in nodes) 606 if (ladderize == 'right' or ladderize == 'RIGHT'): 607 succnode_terminals.reverse() 608 if succnode_terminals: 609 succnodes = zip(*succnode_terminals)[1] 610 else: 611 succnodes = [] 612 else: 613 succnodes = nodes 614 return succnodes
615 616 def newickize(node, ladderize=None): 617 """Convert a node tree to a newick tree recursively.""" 618 if not self.node(node).succ: # terminal 619 return self.node(node).data.taxon + make_info_string(self.node(node).data, terminal=True) 620 else: 621 succnodes = ladderize_nodes(self.node(node).succ, ladderize=ladderize) 622 subtrees = [newickize(sn, ladderize=ladderize) for sn in succnodes] 623 return '(%s)%s' % (','.join(subtrees), make_info_string(self.node(node).data)) 624 625 treeline = ['tree'] 626 if self.name: 627 treeline.append(self.name) 628 else: 629 treeline.append('a_tree') 630 treeline.append('=') 631 if self.weight != 1: 632 treeline.append('[&W%s]' % str(round(float(self.weight), 3))) 633 if self.rooted: 634 treeline.append('[&R]') 635 succnodes = ladderize_nodes(self.node(self.root).succ) 636 subtrees = [newickize(sn, ladderize=ladderize) for sn in succnodes] 637 treeline.append('(%s)' % ','.join(subtrees)) 638 if plain_newick: 639 return treeline[-1] 640 else: 641 return ' '.join(treeline) + ';' 642
643 - def __str__(self):
644 """Short version of to_string(), gives plain tree""" 645 return self.to_string(plain=True)
646
647 - def unroot(self):
648 """Defines a unrooted Tree structure, using data of a rooted Tree.""" 649 # travel down the rooted tree structure and save all branches and the nodes they connect 650 651 def _get_branches(node): 652 branches = [] 653 for b in self.node(node).succ: 654 branches.append([node, b, self.node(b).data.branchlength, self.node(b).data.support]) 655 branches.extend(_get_branches(b)) 656 return branches
657 658 self.unrooted = _get_branches(self.root) 659 # if root is bifurcating, then it is eliminated 660 if len(self.node(self.root).succ) == 2: 661 # find the two branches that connect to root 662 rootbranches = [b for b in self.unrooted if self.root in b[:2]] 663 b1 = self.unrooted.pop(self.unrooted.index(rootbranches[0])) 664 b2 = self.unrooted.pop(self.unrooted.index(rootbranches[1])) 665 # Connect them two each other. If both have support, it should be identical (or one set to None?). 666 # If both have branchlengths, they will be added 667 newbranch = [b1[1], b2[1], b1[2] + b2[2]] 668 if b1[3] is None: 669 newbranch.append(b2[3]) # either None (both rootbranches are unsupported) or some support 670 elif b2[3] is None: 671 newbranch.append(b1[3]) # dito 672 elif b1[3] == b2[3]: 673 newbranch.append(b1[3]) # identical support 674 elif b1[3] == 0 or b2[3] == 0: 675 newbranch.append(b1[3] + b2[3]) # one is 0, take the other 676 else: 677 raise TreeError('Support mismatch in bifurcating root: %f, %f' 678 % (float(b1[3]), float(b2[3]))) 679 self.unrooted.append(newbranch) 680
681 - def root_with_outgroup(self, outgroup=None):
682 683 def _connect_subtree(parent, child): 684 """Hook subtree starting with node child to parent.""" 685 for i, branch in enumerate(self.unrooted): 686 if parent in branch[:2] and child in branch[:2]: 687 branch = self.unrooted.pop(i) 688 break 689 else: 690 raise TreeError('Unable to connect nodes for rooting: nodes %d and %d are not connected' 691 % (parent, child)) 692 self.link(parent, child) 693 self.node(child).data.branchlength = branch[2] 694 self.node(child).data.support = branch[3] 695 # now check if there are more branches connected to the child, and if so, connect them 696 child_branches = [b for b in self.unrooted if child in b[:2]] 697 for b in child_branches: 698 if child == b[0]: 699 succ = b[1] 700 else: 701 succ = b[0] 702 _connect_subtree(child, succ)
703 704 # check the outgroup we're supposed to root with 705 if outgroup is None: 706 return self.root 707 outgroup_node = self.is_monophyletic(outgroup) 708 if outgroup_node == -1: 709 return -1 710 # if tree is already rooted with outgroup on a bifurcating root, 711 # or the outgroup includes all taxa on the tree, then we're fine 712 if (len(self.node(self.root).succ) == 2 and outgroup_node in self.node(self.root).succ) or outgroup_node == self.root: 713 return self.root 714 715 self.unroot() 716 # now we find the branch that connects outgroup and ingroup 717 # print(self.node(outgroup_node).prev) 718 for i, b in enumerate(self.unrooted): 719 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]: 720 root_branch = self.unrooted.pop(i) 721 break 722 else: 723 raise TreeError('Unrooted and rooted Tree do not match') 724 if outgroup_node == root_branch[1]: 725 ingroup_node = root_branch[0] 726 else: 727 ingroup_node = root_branch[1] 728 # now we destroy the old tree structure, but keep node data. Nodes will be reconnected according to new outgroup 729 for n in self.all_ids(): 730 self.node(n).prev = None 731 self.node(n).succ = [] 732 # now we just add both subtrees (outgroup and ingroup) branch for branch 733 root = Nodes.Node(data=NodeData()) # new root 734 self.add(root) # add to tree description 735 self.root = root.id # set as root 736 self.unrooted.append([root.id, ingroup_node, root_branch[2], root_branch[3]]) # add branch to ingroup to unrooted tree 737 self.unrooted.append([root.id, outgroup_node, 0.0, 0.0]) # add branch to outgroup to unrooted tree 738 _connect_subtree(root.id, ingroup_node) # add ingroup 739 _connect_subtree(root.id, outgroup_node) # add outgroup 740 # if theres still a lonely node in self.chain, then it's the old root, and we delete it 741 oldroot = [i for i in self.all_ids() if self.node(i).prev is None and i != self.root] 742 if len(oldroot) > 1: 743 raise TreeError('Isolated nodes in tree description: %s' 744 % ','.join(oldroot)) 745 elif len(oldroot) == 1: 746 self.kill(oldroot[0]) 747 return self.root 748
749 - def merge_with_support(self, bstrees=None, constree=None, threshold=0.5, outgroup=None):
750 """Merges clade support (from consensus or list of bootstrap-trees) with phylogeny. 751 752 tree=merge_bootstrap(phylo,bs_tree=<list_of_trees>) 753 or 754 tree=merge_bootstrap(phylo,consree=consensus_tree with clade support) 755 """ 756 if bstrees and constree: 757 raise TreeError('Specify either list of bootstrap trees or consensus tree, not both') 758 if not (bstrees or constree): 759 raise TreeError('Specify either list of bootstrap trees or consensus tree.') 760 # no outgroup specified: use the smallest clade of the root 761 if outgroup is None: 762 try: 763 succnodes = self.node(self.root).succ 764 smallest = min((len(self.get_taxa(n)), n) for n in succnodes) 765 outgroup = self.get_taxa(smallest[1]) 766 except Exception: 767 raise TreeError("Error determining outgroup.") 768 else: # root with user specified outgroup 769 self.root_with_outgroup(outgroup) 770 771 if bstrees: # calculate consensus 772 constree = consensus(bstrees, threshold=threshold, outgroup=outgroup) 773 else: 774 if not constree.has_support(): 775 constree.branchlength2support() 776 constree.root_with_outgroup(outgroup) 777 # now we travel all nodes, and add support from consensus, if the clade is present in both 778 for pnode in self._walk(): 779 cnode = constree.is_monophyletic(self.get_taxa(pnode)) 780 if cnode > -1: 781 self.node(pnode).data.support = constree.node(cnode).data.support
782 783
784 -def consensus(trees, threshold=0.5, outgroup=None):
785 """Compute a majority rule consensus tree of all clades with relative frequency>=threshold from a list of trees.""" 786 total = len(trees) 787 if total == 0: 788 return None 789 # shouldn't we make sure that it's NodeData or subclass?? 790 dataclass = trees[0].dataclass 791 max_support = trees[0].max_support 792 clades = {} 793 # countclades={} 794 alltaxa = set(trees[0].get_taxa()) 795 # calculate calde frequencies 796 c = 0 797 for t in trees: 798 c += 1 799 # if c%100==0: 800 # print(c) 801 if alltaxa != set(t.get_taxa()): 802 raise TreeError('Trees for consensus must contain the same taxa') 803 t.root_with_outgroup(outgroup=outgroup) 804 for st_node in t._walk(t.root): 805 subclade_taxa = sorted(t.get_taxa(st_node)) 806 subclade_taxa = str(subclade_taxa) # lists are not hashable 807 if subclade_taxa in clades: 808 clades[subclade_taxa] += float(t.weight) / total 809 else: 810 clades[subclade_taxa] = float(t.weight) / total 811 # if subclade_taxa in countclades: 812 # countclades[subclade_taxa]+=t.weight 813 # else: 814 # countclades[subclade_taxa]=t.weight 815 # weed out clades below threshold 816 delclades = [c for c, p in clades.items() if round(p, 3) < threshold] # round can be necessary 817 for c in delclades: 818 del clades[c] 819 # create a tree with a root node 820 consensus = Tree(name='consensus_%2.1f' % float(threshold), data=dataclass) 821 # each clade needs a node in the new tree, add them as isolated nodes 822 for c, s in clades.items(): 823 node = Nodes.Node(data=dataclass()) 824 node.data.support = s 825 node.data.taxon = set(eval(c)) 826 consensus.add(node) 827 # set root node data 828 consensus.node(consensus.root).data.support = None 829 consensus.node(consensus.root).data.taxon = alltaxa 830 # we sort the nodes by no. of taxa in the clade, so root will be the last 831 consensus_ids = consensus.all_ids() 832 consensus_ids.sort(lambda x, y: len(consensus.node(x).data.taxon) - len(consensus.node(y).data.taxon)) 833 # now we just have to hook each node to the next smallest node that includes all taxa of the current 834 for i, current in enumerate(consensus_ids[:-1]): # skip the last one which is the root 835 # print('----') 836 # print('current: %s' % consensus.node(current).data.taxon) 837 # search remaining nodes 838 for parent in consensus_ids[i + 1:]: 839 # print('parent: %s' % consensus.node(parent).data.taxon) 840 if consensus.node(parent).data.taxon.issuperset(consensus.node(current).data.taxon): 841 break 842 else: 843 sys.exit('corrupt tree structure?') 844 # internal nodes don't have taxa 845 if len(consensus.node(current).data.taxon) == 1: 846 consensus.node(current).data.taxon = consensus.node(current).data.taxon.pop() 847 # reset the support for terminal nodes to maximum 848 # consensus.node(current).data.support=max_support 849 else: 850 consensus.node(current).data.taxon = None 851 consensus.link(parent, current) 852 # eliminate root taxon name 853 consensus.node(consensus_ids[-1]).data.taxon = None 854 if alltaxa != set(consensus.get_taxa()): 855 raise TreeError('FATAL ERROR: consensus tree is corrupt') 856 return consensus
857