1
2
3
4
5
6
7 """Tree class to handle phylogenetic trees.
8
9 Provides a set of methods to read and write newick-format tree descriptions,
10 get information about trees (monphyly of taxon sets, congruence between trees,
11 common ancestors,...) and to manipulate trees (reroot trees, split terminal
12 nodes).
13 """
14
15 import random
16 import sys
17 import Nodes
18
19 PRECISION_BRANCHLENGTH=6
20 PRECISION_SUPPORT=6
21 NODECOMMENT_START='[&'
22 NODECOMMENT_END=']'
23
24
27
28
30 """Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
31 - def __init__(self,taxon=None,branchlength=0.0,support=None,comment=None):
32 self.taxon=taxon
33 self.branchlength=branchlength
34 self.support=support
35 self.comment=comment
36
37
38 -class Tree(Nodes.Chain):
39 """Represents a tree using a chain of nodes with on predecessor (=ancestor)
40 and multiple successors (=subclades).
41 """
42
43
44
45
46
47
48
49
50
51 - def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0):
70
72 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
73
74
75 tree = tree.strip()
76 if tree.count('(')!=tree.count(')'):
77 raise TreeError('Parentheses do not match in (sub)tree: '+tree)
78 if tree.count('(')==0:
79
80 nodecomment=tree.find(NODECOMMENT_START)
81 colon=tree.find(':')
82 if colon==-1 and nodecomment==-1:
83 return [tree,[None]]
84 elif colon==-1 and nodecomment>-1:
85 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
86 elif colon>-1 and nodecomment==-1:
87 return [tree[:colon],self._get_values(tree[colon+1:])]
88 elif colon < nodecomment:
89 return [tree[:colon],self._get_values(tree[colon+1:])]
90 else:
91 return [tree[:nodecomment],self._get_values(tree[nodecomment:])]
92 else:
93 closing=tree.rfind(')')
94 val=self._get_values(tree[closing+1:])
95 if not val:
96 val=[None]
97 subtrees=[]
98 plevel=0
99 prev=1
100 for p in range(1,closing):
101 if tree[p]=='(':
102 plevel+=1
103 elif tree[p]==')':
104 plevel-=1
105 elif tree[p]==',' and plevel==0:
106 subtrees.append(tree[prev:p])
107 prev=p+1
108 subtrees.append(tree[prev:closing])
109 subclades=[self._parse(subtree) for subtree in subtrees]
110 return [subclades,val]
111
113 """Adds leaf or tree (in newick format) to a parent_id."""
114 if parent_id is None:
115 raise TreeError('Need node_id to connect to.')
116 for st in tree:
117 nd=self.dataclass()
118 nd = self._add_nodedata(nd, st)
119 if type(st[0])==list:
120 sn=Nodes.Node(nd)
121 self.add(sn,parent_id)
122 self._add_subtree(sn.id,st[0])
123 else:
124 nd.taxon=st[0]
125 leaf=Nodes.Node(nd)
126 self.add(leaf,parent_id)
127
129 """Add data to the node parsed from the comments, taxon and support.
130 """
131 if isinstance(st[1][-1],str) and st[1][-1].startswith(NODECOMMENT_START):
132 nd.comment=st[1].pop(-1)
133
134 elif isinstance(st[1][0], str):
135 nd.taxon = st[1][0]
136 st[1] = st[1][1:]
137 if len(st)>1:
138 if len(st[1])>=2:
139 nd.support=st[1][0]
140 if st[1][1] is not None:
141 nd.branchlength=st[1][1]
142 elif len(st[1])==1:
143 if not self.__values_are_support:
144 if st[1][0] is not None:
145 nd.branchlength=st[1][0]
146 else:
147 nd.support=st[1][0]
148 return nd
149
180
181 - def _walk(self,node=None):
182 """Return all node_ids downwards from a node."""
183
184 if node is None:
185 node=self.root
186 for n in self.node(node).succ:
187 yield n
188 for sn in self._walk(n):
189 yield sn
190
191 - def node(self,node_id):
192 """Return the instance of node_id.
193
194 node = node(self,node_id)
195 """
196 if node_id not in self.chain:
197 raise TreeError('Unknown node_id: %d' % node_id)
198 return self.chain[node_id]
199
200 - def split(self,parent_id=None,n=2,branchlength=1.0):
201 """Speciation: generates n (default two) descendants of a node.
202
203 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
204 """
205 if parent_id is None:
206 raise TreeError('Missing node_id.')
207 ids=[]
208 parent_data=self.chain[parent_id].data
209 for i in range(n):
210 node=Nodes.Node()
211 if parent_data:
212 node.data=self.dataclass()
213
214 if parent_data.taxon:
215 node.data.taxon=parent_data.taxon+str(i)
216 node.data.branchlength=branchlength
217 ids.append(self.add(node,parent_id))
218 return ids
219
221 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
222
223 node_id = search_taxon(self,taxon)
224 """
225 for id,node in self.chain.iteritems():
226 if node.data.taxon==taxon:
227 return id
228 return None
229
231 """Prunes a terminal taxon from the tree.
232
233 id_of_previous_node = prune(self,taxon)
234 If taxon is from a bifurcation, the connectiong node will be collapsed
235 and its branchlength added to remaining terminal node. This might be no
236 longer a meaningful value'
237 """
238
239 id=self.search_taxon(taxon)
240 if id is None:
241 raise TreeError('Taxon not found: %s' % taxon)
242 elif id not in self.get_terminals():
243 raise TreeError('Not a terminal taxon: %s' % taxon)
244 else:
245 prev=self.unlink(id)
246 self.kill(id)
247 if len(self.node(prev).succ)==1:
248 if prev==self.root:
249 self.root=self.node(self.root).succ[0]
250 self.node(self.root).branchlength=0.0
251 self.kill(prev)
252 else:
253 succ=self.node(prev).succ[0]
254 new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
255 self.collapse(prev)
256 self.node(succ).data.branchlength=new_bl
257 return prev
258
279
281 """Return a list of all terminal nodes."""
282 return [i for i in self.all_ids() if self.node(i).succ==[]]
283
285 """Returns True if node is a terminal node."""
286 return self.node(node).succ==[]
287
289 """Returns True if node is an internal node."""
290 return len(self.node(node).succ)>0
291
293 """Returns True if all successors of a node are terminal ones."""
294 if self.is_terminal(node):
295 return False not in [self.is_terminal(n) for n in self.node(node).succ]
296 else:
297 return False
298
300 """Counts the number of terminal nodes that are attached to a node."""
301 if node is None:
302 node=self.root
303 return len([n for n in self._walk(node) if self.is_terminal(n)])
304
306 """Collapses all subtrees which belong to the same genus (i.e share the same first word in their taxon name.)"""
307
308 while True:
309 for n in self._walk():
310 if self.is_terminal(n):
311 continue
312 taxa=self.get_taxa(n)
313 genera=[]
314 for t in taxa:
315 if space_equals_underscore:
316 t=t.replace(' ','_')
317 try:
318 genus=t.split('_',1)[0]
319 except:
320 genus='None'
321 if genus not in genera:
322 genera.append(genus)
323 if len(genera)==1:
324 self.node(n).data.taxon=genera[0]+' <collapsed>'
325
326 nodes2kill=[kn for kn in self._walk(node=n)]
327 for kn in nodes2kill:
328 self.kill(kn)
329 self.node(n).succ=[]
330 break
331 else:
332 break
333
335 """Adds up the branchlengths from root (default self.root) to node.
336
337 sum = sum_branchlength(self,root=None,node=None)
338 """
339
340 if root is None:
341 root=self.root
342 if node is None:
343 raise TreeError('Missing node id.')
344 blen=0.0
345 while node is not None and node is not root:
346 blen+=self.node(node).data.branchlength
347 node=self.node(node).prev
348 return blen
349
351 """Return subtree as a set of nested sets.
352
353 sets = set_subtree(self,node)
354 """
355
356 if self.node(node).succ==[]:
357 return self.node(node).data.taxon
358 else:
359 try:
360 return frozenset([self.set_subtree(n) for n in self.node(node).succ])
361 except:
362 print node
363 print self.node(node).succ
364 for n in self.node(node).succ:
365 print n, self.set_subtree(n)
366 print [self.set_subtree(n) for n in self.node(node).succ]
367 raise
368
370 """Compare tree and tree2 for identity.
371
372 result = is_identical(self,tree2)
373 """
374 return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
375
377 """Compares branches with support>threshold for compatibility.
378
379 result = is_compatible(self,tree2,threshold)
380 """
381
382
383 missing2=set(self.get_taxa())-set(tree2.get_taxa())
384 missing1=set(tree2.get_taxa())-set(self.get_taxa())
385 if strict and (missing1 or missing2):
386 if missing1:
387 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name)
388 if missing2:
389 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name)
390 raise TreeError('Can\'t compare trees with different taxon compositions.')
391 t1=[(set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if
392 self.node(n).succ and
393 (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
394 t2=[(set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if
395 tree2.node(n).succ and
396 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
397 conflict=[]
398 for (st1,sup1) in t1:
399 for (st2,sup2) in t2:
400 if not st1.issubset(st2) and not st2.issubset(st1):
401 intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2
402
403 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)):
404 conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
405 return conflict
406
408 """Return the common ancestor that connects two nodes.
409
410 node_id = common_ancestor(self,node1,node2)
411 """
412
413 l1=[self.root]+self.trace(self.root,node1)
414 l2=[self.root]+self.trace(self.root,node2)
415 return [n for n in l1 if n in l2][-1]
416
418 """Add and return the sum of the branchlengths between two nodes.
419 dist = distance(self,node1,node2)
420 """
421
422 ca=self.common_ancestor(node1,node2)
423 return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
424
426 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
427
428 result = is_monophyletic(self,taxon_list)
429 """
430 if isinstance(taxon_list,str):
431 taxon_set=set([taxon_list])
432 else:
433 taxon_set=set(taxon_list)
434 node_id=self.root
435 while 1:
436 subclade_taxa=set(self.get_taxa(node_id))
437 if subclade_taxa==taxon_set:
438 return node_id
439 else:
440 for subnode in self.chain[node_id].succ:
441 if set(self.get_taxa(subnode)).issuperset(taxon_set):
442 node_id=subnode
443 break
444 else:
445 return -1
446
461
463 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0
464
465 This is necessary when support has been stored as branchlength (e.g. paup), and has thus
466 been read in as branchlength.
467 """
468
469 for n in self.chain:
470 self.node(n).data.support=self.node(n).data.branchlength
471 self.node(n).data.branchlength=0.0
472
474 """Convert absolute support (clade-count) to rel. frequencies.
475
476 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of
477 calculating relative frequencies."""
478
479 for n in self._walk():
480 if self.node(n).data.support:
481 self.node(n).data.support/=float(nrep)
482
484 """Returns True if any of the nodes has data.support != None."""
485 for n in self._walk(node):
486 if self.node(n).data.support:
487 return True
488 else:
489 return False
490
491 - def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
492 """Generates a random tree with ntax taxa and/or taxa from taxlabels.
493
494 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
495 Trees are bifurcating by default. (Polytomies not yet supported).
496 """
497
498 if not ntax and taxon_list:
499 ntax=len(taxon_list)
500 elif not taxon_list and ntax:
501 taxon_list=['taxon'+str(i+1) for i in range(ntax)]
502 elif not ntax and not taxon_list:
503 raise TreeError('Either numer of taxa or list of taxa must be specified.')
504 elif ntax != len(taxon_list):
505 raise TreeError('Length of taxon list must correspond to ntax.')
506
507 self.__init__()
508 terminals=self.get_terminals()
509
510 while len(terminals)<ntax:
511 newsplit=random.choice(terminals)
512 new_terminals=self.split(parent_id=newsplit,branchlength=branchlength)
513
514 if branchlength_sd:
515 for nt in new_terminals:
516 bl=random.gauss(branchlength,branchlength_sd)
517 if bl<0:
518 bl=0
519 self.node(nt).data.branchlength=bl
520 terminals.extend(new_terminals)
521 terminals.remove(newsplit)
522
523 random.shuffle(taxon_list)
524 for (node,name) in zip(terminals,taxon_list):
525 self.node(node).data.taxon=name
526
528 """Quick and dirty lists of all nodes."""
529 table=[('#','taxon','prev','succ','brlen','blen (sum)','support','comment')]
530
531 for i in sorted(self.all_ids()):
532 n=self.node(i)
533 if not n.data:
534 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-','-'))
535 else:
536 tx=n.data.taxon
537 if not tx:
538 tx='-'
539 blength="%0.2f" % n.data.branchlength
540 if blength is None:
541 blength='-'
542 sum_blength='-'
543 else:
544 sum_blength="%0.2f" % self.sum_branchlength(node=i)
545 support=n.data.support
546 if support is None:
547 support='-'
548 else:
549 support="%0.2f" % support
550 comment=n.data.comment
551 if comment is None:
552 comment='-'
553 table.append((str(i),tx,str(n.prev),str(n.succ),
554 blength, sum_blength, support, comment))
555 print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s %20s' % l for l in table])
556 print '\nRoot: ',self.root
557
558 - def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False,ladderize=None,ignore_comments=True):
559 """Return a paup compatible tree line."""
560
561 if support_as_branchlengths or branchlengths_only:
562 plain=False
563 self.support_as_branchlengths=support_as_branchlengths
564 self.branchlengths_only=branchlengths_only
565 self.ignore_comments=ignore_comments
566 self.plain=plain
567
568 def make_info_string(data,terminal=False):
569 """Creates nicely formatted support/branchlengths."""
570
571 if self.plain:
572 info_string= ''
573 elif self.support_as_branchlengths:
574 if terminal:
575 info_string= ':%1.2f' % self.max_support
576 elif data.support:
577 info_string= ':%1.2f' % (data.support)
578 else:
579 info_string=':0.00'
580 elif self.branchlengths_only:
581 info_string= ':%1.5f' % (data.branchlength)
582 else:
583 if terminal:
584 info_string= ':%1.5f' % (data.branchlength)
585 else:
586 if data.branchlength is not None and data.support is not None:
587 info_string= '%1.2f:%1.5f' % (data.support,data.branchlength)
588 elif data.branchlength is not None:
589 info_string= '0.00000:%1.5f' % (data.branchlength)
590 elif data.support is not None:
591 info_string= '%1.2f:0.00000' % (data.support)
592 else:
593 info_string= '0.00:0.00000'
594 if not ignore_comments and hasattr(data,'nodecomment'):
595 info_string=str(data.nodecomment)+info_string
596 return info_string
597
598 def ladderize_nodes(nodes,ladderize=None):
599 """Sorts node numbers according to the number of terminal nodes."""
600 if ladderize in ['left','LEFT','right','RIGHT']:
601 succnode_terminals=[(self.count_terminals(node=n),n) for n in nodes]
602 succnode_terminals.sort()
603 if (ladderize=='right' or ladderize=='RIGHT'):
604 succnode_terminals.reverse()
605 if succnode_terminals:
606 succnodes=zip(*succnode_terminals)[1]
607 else:
608 succnodes=[]
609 else:
610 succnodes=nodes
611 return succnodes
612
613 def newickize(node,ladderize=None):
614 """Convert a node tree to a newick tree recursively."""
615
616 if not self.node(node).succ:
617 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True)
618 else:
619 succnodes=ladderize_nodes(self.node(node).succ,ladderize=ladderize)
620 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
621 return '(%s)%s' % (','.join(subtrees),make_info_string(self.node(node).data))
622
623 treeline=['tree']
624 if self.name:
625 treeline.append(self.name)
626 else:
627 treeline.append('a_tree')
628 treeline.append('=')
629 if self.weight != 1:
630 treeline.append('[&W%s]' % str(round(float(self.weight),3)))
631 if self.rooted:
632 treeline.append('[&R]')
633 succnodes=ladderize_nodes(self.node(self.root).succ)
634 subtrees=[newickize(sn,ladderize=ladderize) for sn in succnodes]
635 treeline.append('(%s)' % ','.join(subtrees))
636 if plain_newick:
637 return treeline[-1]
638 else:
639 return ' '.join(treeline)+';'
640
642 """Short version of to_string(), gives plain tree"""
643 return self.to_string(plain=True)
644
646 """Defines a unrooted Tree structure, using data of a rooted Tree."""
647
648
649
650 def _get_branches(node):
651 branches=[]
652 for b in self.node(node).succ:
653 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support])
654 branches.extend(_get_branches(b))
655 return branches
656
657 self.unrooted=_get_branches(self.root)
658
659 if len(self.node(self.root).succ)==2:
660
661 rootbranches=[b for b in self.unrooted if self.root in b[:2]]
662 b1=self.unrooted.pop(self.unrooted.index(rootbranches[0]))
663 b2=self.unrooted.pop(self.unrooted.index(rootbranches[1]))
664
665
666 newbranch=[b1[1],b2[1],b1[2]+b2[2]]
667 if b1[3] is None:
668 newbranch.append(b2[3])
669 elif b2[3] is None:
670 newbranch.append(b1[3])
671 elif b1[3]==b2[3]:
672 newbranch.append(b1[3])
673 elif b1[3]==0 or b2[3]==0:
674 newbranch.append(b1[3]+b2[3])
675 else:
676 raise TreeError('Support mismatch in bifurcating root: %f, %f'
677 % (float(b1[3]),float(b2[3])))
678 self.unrooted.append(newbranch)
679
681
682 def _connect_subtree(parent,child):
683 """Hook subtree starting with node child to parent."""
684 for i,branch in enumerate(self.unrooted):
685 if parent in branch[:2] and child in branch[:2]:
686 branch=self.unrooted.pop(i)
687 break
688 else:
689 raise TreeError('Unable to connect nodes for rooting: nodes %d and %d are not connected'
690 % (parent,child))
691 self.link(parent,child)
692 self.node(child).data.branchlength=branch[2]
693 self.node(child).data.support=branch[3]
694
695 child_branches=[b for b in self.unrooted if child in b[:2]]
696 for b in child_branches:
697 if child==b[0]:
698 succ=b[1]
699 else:
700 succ=b[0]
701 _connect_subtree(child,succ)
702
703
704 if outgroup is None:
705 return self.root
706 outgroup_node=self.is_monophyletic(outgroup)
707 if outgroup_node==-1:
708 return -1
709
710
711 if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root:
712 return self.root
713
714 self.unroot()
715
716
717 for i,b in enumerate(self.unrooted):
718 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]:
719 root_branch=self.unrooted.pop(i)
720 break
721 else:
722 raise TreeError('Unrooted and rooted Tree do not match')
723 if outgroup_node==root_branch[1]:
724 ingroup_node=root_branch[0]
725 else:
726 ingroup_node=root_branch[1]
727
728 for n in self.all_ids():
729 self.node(n).prev=None
730 self.node(n).succ=[]
731
732 root=Nodes.Node(data=NodeData())
733 self.add(root)
734 self.root=root.id
735 self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]])
736 self.unrooted.append([root.id,outgroup_node,0.0,0.0])
737 _connect_subtree(root.id,ingroup_node)
738 _connect_subtree(root.id,outgroup_node)
739
740 oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root]
741 if len(oldroot)>1:
742 raise TreeError('Isolated nodes in tree description: %s'
743 % ','.join(oldroot))
744 elif len(oldroot)==1:
745 self.kill(oldroot[0])
746 return self.root
747
749 """Merges clade support (from consensus or list of bootstrap-trees) with phylogeny.
750
751 tree=merge_bootstrap(phylo,bs_tree=<list_of_trees>)
752 or
753 tree=merge_bootstrap(phylo,consree=consensus_tree with clade support)
754 """
755
756 if bstrees and constree:
757 raise TreeError('Specify either list of bootstrap trees or consensus tree, not both')
758 if not (bstrees or constree):
759 raise TreeError('Specify either list of bootstrap trees or consensus tree.')
760
761 if outgroup is None:
762 try:
763 succnodes=self.node(self.root).succ
764 smallest=min([(len(self.get_taxa(n)),n) for n in succnodes])
765 outgroup=self.get_taxa(smallest[1])
766 except:
767 raise TreeError("Error determining outgroup.")
768 else:
769 self.root_with_outgroup(outgroup)
770
771 if bstrees:
772 constree=consensus(bstrees,threshold=threshold,outgroup=outgroup)
773 else:
774 if not constree.has_support():
775 constree.branchlength2support()
776 constree.root_with_outgroup(outgroup)
777
778 for pnode in self._walk():
779 cnode=constree.is_monophyletic(self.get_taxa(pnode))
780 if cnode>-1:
781 self.node(pnode).data.support=constree.node(cnode).data.support
782
783
784 -def consensus(trees, threshold=0.5,outgroup=None):
859