Package Bio :: Package Phylo :: Module BaseTree
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.BaseTree

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Base classes for Bio.Phylo objects. 
   7   
   8  All object representations for phylogenetic trees should derive from these base 
   9  classes in order to use the common methods defined on them. 
  10  """ 
  11   
  12  from Bio._py3k import basestring, filter, unicode, zip 
  13   
  14  import collections 
  15  import copy 
  16  import itertools 
  17  import random 
  18  import re 
  19   
  20  from Bio import _utils 
  21   
  22  # NB: On Python 2, repr() and str() are specified to return byte strings, not 
  23  # unicode. On Python 3, it's the opposite. Horrible. 
  24  import sys 
  25  if sys.version_info[0] < 3: 
26 - def as_string(s):
27 if isinstance(s, unicode): 28 return s.encode('utf-8') 29 return str(s)
30 else: 31 as_string = str
32 33 34 # General tree-traversal algorithms 35 36 -def _level_traverse(root, get_children):
37 """Traverse a tree in breadth-first (level) order.""" 38 Q = collections.deque([root]) 39 while Q: 40 v = Q.popleft() 41 yield v 42 Q.extend(get_children(v))
43
44 45 -def _preorder_traverse(root, get_children):
46 """Traverse a tree in depth-first pre-order (parent before children).""" 47 def dfs(elem): 48 yield elem 49 for v in get_children(elem): 50 for u in dfs(v): 51 yield u
52 for elem in dfs(root): 53 yield elem 54
55 56 -def _postorder_traverse(root, get_children):
57 """Traverse a tree in depth-first post-order (children before parent).""" 58 def dfs(elem): 59 for v in get_children(elem): 60 for u in dfs(v): 61 yield u 62 yield elem
63 for elem in dfs(root): 64 yield elem 65
66 67 -def _sorted_attrs(elem):
68 """Get a flat list of elem's attributes, sorted for consistency.""" 69 singles = [] 70 lists = [] 71 # Sort attributes for consistent results 72 for attrname, child in sorted(elem.__dict__.items(), 73 key=lambda kv: kv[0]): 74 if child is None: 75 continue 76 if isinstance(child, list): 77 lists.extend(child) 78 else: 79 singles.append(child) 80 return (x for x in singles + lists 81 if isinstance(x, TreeElement))
82
83 84 # Factory functions to generalize searching for clades/nodes 85 86 -def _identity_matcher(target):
87 """Match a node to the target object by identity.""" 88 def match(node): 89 return (node is target)
90 return match 91
92 93 -def _class_matcher(target_cls):
94 """Match a node if it's an instance of the given class.""" 95 def match(node): 96 return isinstance(node, target_cls)
97 return match 98
99 100 -def _string_matcher(target):
101 def match(node): 102 if isinstance(node, (Clade, Tree)): 103 # Avoid triggering specialized or recursive magic methods 104 return node.name == target 105 return as_string(node) == target
106 return match 107
108 109 -def _attribute_matcher(kwargs):
110 """Match a node by specified attribute values. 111 112 ``terminal`` is a special case: True restricts the search to external (leaf) 113 nodes, False restricts to internal nodes, and None allows all tree elements 114 to be searched, including phyloXML annotations. 115 116 Otherwise, for a tree element to match the specification (i.e. for the 117 function produced by `_attribute_matcher` to return True when given a tree 118 element), it must have each of the attributes specified by the keys and 119 match each of the corresponding values -- think 'and', not 'or', for 120 multiple keys. 121 """ 122 def match(node): 123 if 'terminal' in kwargs: 124 # Special case: restrict to internal/external/any nodes 125 kwa_copy = kwargs.copy() 126 pattern = kwa_copy.pop('terminal') 127 if (pattern is not None and 128 (not hasattr(node, 'is_terminal') or 129 node.is_terminal() != pattern)): 130 return False 131 else: 132 kwa_copy = kwargs 133 for key, pattern in kwa_copy.items(): 134 # Nodes must match all other specified attributes 135 if not hasattr(node, key): 136 return False 137 target = getattr(node, key) 138 if isinstance(pattern, basestring): 139 return (isinstance(target, basestring) and 140 re.match(pattern + '$', target)) 141 if isinstance(pattern, bool): 142 return (pattern == bool(target)) 143 if isinstance(pattern, int): 144 return (pattern == target) 145 if pattern is None: 146 return (target is None) 147 raise TypeError('invalid query type: %s' % type(pattern)) 148 return True
149 return match 150
151 152 -def _function_matcher(matcher_func):
153 """Safer attribute lookup -- returns False instead of raising an error.""" 154 def match(node): 155 try: 156 return matcher_func(node) 157 except (LookupError, AttributeError, ValueError, TypeError): 158 return False
159 return match 160
161 162 -def _object_matcher(obj):
163 """Retrieve a matcher function by passing an arbitrary object. 164 165 i.e. passing a `TreeElement` such as a `Clade` or `Tree` instance returns an 166 identity matcher, passing a type such as the `PhyloXML.Taxonomy` class 167 returns a class matcher, and passing a dictionary returns an attribute 168 matcher. 169 170 The resulting 'match' function returns True when given an object matching 171 the specification (identity, type or attribute values), otherwise False. 172 This is useful for writing functions that search the tree, and probably 173 shouldn't be used directly by the end user. 174 """ 175 if isinstance(obj, TreeElement): 176 return _identity_matcher(obj) 177 if isinstance(obj, type): 178 return _class_matcher(obj) 179 if isinstance(obj, basestring): 180 return _string_matcher(obj) 181 if isinstance(obj, dict): 182 return _attribute_matcher(obj) 183 if callable(obj): 184 return _function_matcher(obj) 185 raise ValueError("%s (type %s) is not a valid type for comparison." 186 % (obj, type(obj)))
187
188 189 -def _combine_matchers(target, kwargs, require_spec):
190 """Merge target specifications with keyword arguments. 191 192 Dispatch the components to the various matcher functions, then merge into a 193 single boolean function. 194 """ 195 if not target: 196 if not kwargs: 197 if require_spec: 198 raise ValueError("you must specify a target object or keyword " 199 "arguments.") 200 return lambda x: True 201 return _attribute_matcher(kwargs) 202 match_obj = _object_matcher(target) 203 if not kwargs: 204 return match_obj 205 match_kwargs = _attribute_matcher(kwargs) 206 return (lambda x: match_obj(x) and match_kwargs(x))
207
208 209 -def _combine_args(first, *rest):
210 """Convert ``[targets]`` or ``*targets`` arguments to a single iterable. 211 212 This helps other functions work like the built-in functions `max` and 213 `min`. 214 """ 215 # Background: is_monophyletic takes a single list or iterable (like the 216 # same method in Bio.Nexus.Trees); root_with_outgroup and common_ancestor 217 # take separate arguments. This mismatch was in the initial release and I 218 # didn't notice the inconsistency until after Biopython 1.55. I can think 219 # of cases where either style is more convenient, so let's support both 220 # (for backward compatibility and consistency between methods). 221 if hasattr(first, '__iter__') and not (isinstance(first, TreeElement) or 222 isinstance(first, type) or 223 isinstance(first, basestring) or 224 isinstance(first, dict)): 225 # `terminals` is an iterable of targets 226 if rest: 227 raise ValueError("Arguments must be either a single list of " 228 "targets, or separately specified targets " 229 "(e.g. foo(t1, t2, t3)), but not both.") 230 return first 231 # `terminals` is a single target -- wrap in a container 232 return itertools.chain([first], rest)
233
234 235 # Class definitions 236 237 -class TreeElement(object):
238 """Base class for all Bio.Phylo classes.""" 239
240 - def __repr__(self):
241 """Show this object's constructor with its primitive arguments.""" 242 def pair_as_kwarg_string(key, val): 243 if isinstance(val, basestring): 244 return ("%s='%s'" 245 % (key, _utils.trim_str(as_string(val), 60, '...'))) 246 return "%s=%s" % (key, val)
247 return ('%s(%s)' 248 % (self.__class__.__name__, 249 ', '.join(pair_as_kwarg_string(key, val) 250 for key, val in sorted(self.__dict__.items()) 251 if val is not None and 252 type(val) in (str, int, float, bool, unicode))))
253 254 __str__ = __repr__ 255
256 257 -class TreeMixin(object):
258 """Methods for Tree- and Clade-based classes. 259 260 This lets `Tree` and `Clade` support the same traversal and searching 261 operations without requiring Clade to inherit from Tree, so Clade isn't 262 required to have all of Tree's attributes -- just ``root`` (a Clade 263 instance) and ``is_terminal``. 264 """ 265 # Traversal methods 266
267 - def _filter_search(self, filter_func, order, follow_attrs):
268 """Perform a BFS or DFS traversal through all elements in this tree. 269 270 :returns: generator of all elements for which `filter_func` is True. 271 """ 272 order_opts = {'preorder': _preorder_traverse, 273 'postorder': _postorder_traverse, 274 'level': _level_traverse} 275 try: 276 order_func = order_opts[order] 277 except KeyError: 278 raise ValueError("Invalid order '%s'; must be one of: %s" 279 % (order, tuple(order_opts))) 280 if follow_attrs: 281 get_children = _sorted_attrs 282 root = self 283 else: 284 get_children = lambda elem: elem.clades 285 root = self.root 286 return filter(filter_func, order_func(root, get_children))
287
288 - def find_any(self, *args, **kwargs):
289 """Return the first element found by find_elements(), or None. 290 291 This is also useful for checking whether any matching element exists in 292 the tree, and can be used in a conditional expression. 293 """ 294 hits = self.find_elements(*args, **kwargs) 295 try: 296 return next(hits) 297 except StopIteration: 298 return None
299
300 - def find_elements(self, target=None, terminal=None, order='preorder', 301 **kwargs):
302 """Find all tree elements matching the given attributes. 303 304 The arbitrary keyword arguments indicate the attribute name of the 305 sub-element and the value to match: string, integer or boolean. Strings 306 are evaluated as regular expression matches; integers are compared 307 directly for equality, and booleans evaluate the attribute's truth value 308 (True or False) before comparing. To handle nonzero floats, search with 309 a boolean argument, then filter the result manually. 310 311 If no keyword arguments are given, then just the class type is used for 312 matching. 313 314 The result is an iterable through all matching objects, by depth-first 315 search. (Not necessarily the same order as the elements appear in the 316 source file!) 317 318 :Parameters: 319 target : TreeElement instance, type, dict, or callable 320 Specifies the characteristics to search for. (The default, 321 TreeElement, matches any standard Bio.Phylo type.) 322 terminal : bool 323 A boolean value to select for or against terminal nodes (a.k.a. 324 leaf nodes). True searches for only terminal nodes, False 325 excludes terminal nodes, and the default, None, searches both 326 terminal and non-terminal nodes, as well as any tree elements 327 lacking the ``is_terminal`` method. 328 order : {'preorder', 'postorder', 'level'} 329 Tree traversal order: 'preorder' (default) is depth-first 330 search, 'postorder' is DFS with child nodes preceding parents, 331 and 'level' is breadth-first search. 332 333 Example 334 ------- 335 336 >>> from Bio import Phylo 337 >>> phx = Phylo.PhyloXMLIO.read('PhyloXML/phyloxml_examples.xml') 338 >>> matches = phx.phylogenies[5].find_elements(code='OCTVU') 339 >>> next(matches) 340 Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris') 341 342 """ 343 if terminal is not None: 344 kwargs['terminal'] = terminal 345 is_matching_elem = _combine_matchers(target, kwargs, False) 346 return self._filter_search(is_matching_elem, order, True)
347
348 - def find_clades(self, target=None, terminal=None, order='preorder', 349 **kwargs):
350 """Find each clade containing a matching element. 351 352 That is, find each element as with find_elements(), but return the 353 corresponding clade object. (This is usually what you want.) 354 355 :returns: an iterable through all matching objects, searching 356 depth-first (preorder) by default. 357 """ 358 def match_attrs(elem): 359 orig_clades = elem.__dict__.pop('clades') 360 found = elem.find_any(target, **kwargs) 361 elem.clades = orig_clades 362 return (found is not None)
363 if terminal is None: 364 is_matching_elem = match_attrs 365 else: 366 def is_matching_elem(elem): 367 return ((elem.is_terminal() == terminal) and 368 match_attrs(elem))
369 return self._filter_search(is_matching_elem, order, False) 370
371 - def get_path(self, target=None, **kwargs):
372 """List the clades directly between this root and the given target. 373 374 :returns: list of all clade objects along this path, ending with the 375 given target, but excluding the root clade. 376 """ 377 # Only one path will work -- ignore weights and visits 378 path = [] 379 match = _combine_matchers(target, kwargs, True) 380 381 def check_in_path(v): 382 if match(v): 383 path.append(v) 384 return True 385 elif v.is_terminal(): 386 return False 387 for child in v: 388 if check_in_path(child): 389 path.append(v) 390 return True 391 return False
392 393 if not check_in_path(self.root): 394 return None 395 return path[-2::-1] 396
397 - def get_nonterminals(self, order='preorder'):
398 """Get a list of all of this tree's nonterminal (internal) nodes.""" 399 return list(self.find_clades(terminal=False, order=order))
400
401 - def get_terminals(self, order='preorder'):
402 """Get a list of all of this tree's terminal (leaf) nodes.""" 403 return list(self.find_clades(terminal=True, order=order))
404
405 - def trace(self, start, finish):
406 """List of all clade object between two targets in this tree. 407 408 Excluding `start`, including `finish`. 409 """ 410 mrca = self.common_ancestor(start, finish) 411 fromstart = mrca.get_path(start)[-2::-1] 412 to = mrca.get_path(finish) 413 return fromstart + [mrca] + to
414 415 # Information methods 416
417 - def common_ancestor(self, targets, *more_targets):
418 """Most recent common ancestor (clade) of all the given targets. 419 420 Edge cases: 421 - If no target is given, returns self.root 422 - If 1 target is given, returns the target 423 - If any target is not found in this tree, raises a ValueError 424 """ 425 paths = [self.get_path(t) 426 for t in _combine_args(targets, *more_targets)] 427 # Validation -- otherwise izip throws a spooky error below 428 for p, t in zip(paths, targets): 429 if p is None: 430 raise ValueError("target %s is not in this tree" % repr(t)) 431 mrca = self.root 432 for level in zip(*paths): 433 ref = level[0] 434 for other in level[1:]: 435 if ref is not other: 436 break 437 else: 438 mrca = ref 439 if ref is not mrca: 440 break 441 return mrca
442
443 - def count_terminals(self):
444 """Counts the number of terminal (leaf) nodes within this tree.""" 445 return _utils.iterlen(self.find_clades(terminal=True))
446
447 - def depths(self, unit_branch_lengths=False):
448 """Create a mapping of tree clades to depths (by branch length). 449 450 :Parameters: 451 unit_branch_lengths : bool 452 If True, count only the number of branches (levels in the tree). 453 By default the distance is the cumulative branch length leading 454 to the clade. 455 456 :returns: dict of {clade: depth}, where keys are all of the Clade 457 instances in the tree, and values are the distance from the root to 458 each clade (including terminals). 459 """ 460 if unit_branch_lengths: 461 depth_of = lambda c: 1 462 else: 463 depth_of = lambda c: c.branch_length or 0 464 depths = {} 465 466 def update_depths(node, curr_depth): 467 depths[node] = curr_depth 468 for child in node.clades: 469 new_depth = curr_depth + depth_of(child) 470 update_depths(child, new_depth)
471 472 update_depths(self.root, self.root.branch_length or 0) 473 return depths 474
475 - def distance(self, target1, target2=None):
476 """Calculate the sum of the branch lengths between two targets. 477 478 If only one target is specified, the other is the root of this tree. 479 """ 480 if target2 is None: 481 return sum(n.branch_length for n in self.get_path(target1) 482 if n.branch_length is not None) 483 mrca = self.common_ancestor(target1, target2) 484 return mrca.distance(target1) + mrca.distance(target2)
485
486 - def is_bifurcating(self):
487 """Return True if tree downstream of node is strictly bifurcating. 488 489 I.e., all nodes have either 2 or 0 children (internal or external, 490 respectively). The root may have 3 descendents and still be considered 491 part of a bifurcating tree, because it has no ancestor. 492 """ 493 # Root can be trifurcating 494 if isinstance(self, Tree) and len(self.root) == 3: 495 return (self.root.clades[0].is_bifurcating() and 496 self.root.clades[1].is_bifurcating() and 497 self.root.clades[2].is_bifurcating()) 498 if len(self.root) == 2: 499 return (self.root.clades[0].is_bifurcating() and 500 self.root.clades[1].is_bifurcating()) 501 if len(self.root) == 0: 502 return True 503 return False
504
505 - def is_monophyletic(self, terminals, *more_terminals):
506 """MRCA of terminals if they comprise a complete subclade, or False. 507 508 I.e., there exists a clade such that its terminals are the same set as 509 the given targets. 510 511 The given targets must be terminals of the tree. 512 513 To match both `Bio.Nexus.Trees` and the other multi-target methods in 514 Bio.Phylo, arguments to this method can be specified either of two ways: 515 (i) as a single list of targets, or (ii) separately specified targets, 516 e.g. is_monophyletic(t1, t2, t3) -- but not both. 517 518 For convenience, this method returns the common ancestor (MCRA) of the 519 targets if they are monophyletic (instead of the value True), and False 520 otherwise. 521 522 :returns: common ancestor if terminals are monophyletic, otherwise False. 523 """ 524 target_set = set(_combine_args(terminals, *more_terminals)) 525 current = self.root 526 while True: 527 if set(current.get_terminals()) == target_set: 528 return current 529 # Try a narrower subclade 530 for subclade in current.clades: 531 if set(subclade.get_terminals()).issuperset(target_set): 532 current = subclade 533 break 534 else: 535 return False
536
537 - def is_parent_of(self, target=None, **kwargs):
538 """True if target is a descendent of this tree. 539 540 Not required to be a direct descendent. 541 542 To check only direct descendents of a clade, simply use list membership 543 testing: ``if subclade in clade: ...`` 544 """ 545 return self.get_path(target, **kwargs) is not None
546
547 - def is_preterminal(self):
548 """True if all direct descendents are terminal.""" 549 if self.root.is_terminal(): 550 return False 551 for clade in self.root.clades: 552 if not clade.is_terminal(): 553 return False 554 return True
555
556 - def total_branch_length(self):
557 """Calculate the sum of all the branch lengths in this tree.""" 558 return sum(node.branch_length 559 for node in self.find_clades(branch_length=True))
560 561 # Tree manipulation methods 562
563 - def collapse(self, target=None, **kwargs):
564 """Deletes target from the tree, relinking its children to its parent. 565 566 :returns: the parent clade. 567 """ 568 path = self.get_path(target, **kwargs) 569 if not path: 570 raise ValueError("couldn't collapse %s in this tree" 571 % (target or kwargs)) 572 if len(path) == 1: 573 parent = self.root 574 else: 575 parent = path[-2] 576 popped = parent.clades.pop(parent.clades.index(path[-1])) 577 extra_length = popped.branch_length or 0 578 for child in popped: 579 child.branch_length += extra_length 580 parent.clades.extend(popped.clades) 581 return parent
582
583 - def collapse_all(self, target=None, **kwargs):
584 """Collapse all the descendents of this tree, leaving only terminals. 585 586 Total branch lengths are preserved, i.e. the distance to each terminal 587 stays the same. 588 589 For example, this will safely collapse nodes with poor bootstrap 590 support: 591 592 >>> from Bio import Phylo 593 >>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') 594 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 595 Total branch length 20.44 596 >>> tree.collapse_all(lambda c: c.confidence is not None and c.confidence < 70) 597 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 598 Total branch length 21.37 599 600 This implementation avoids strange side-effects by using level-order 601 traversal and testing all clade properties (versus the target 602 specification) up front. In particular, if a clade meets the target 603 specification in the original tree, it will be collapsed. For example, 604 if the condition is: 605 606 >>> from Bio import Phylo 607 >>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') 608 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 609 Total branch length 20.44 610 >>> tree.collapse_all(lambda c: c.branch_length < 0.1) 611 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 612 Total branch length 21.13 613 614 Collapsing a clade's parent node adds the parent's branch length to the 615 child, so during the execution of collapse_all, a clade's branch_length 616 may increase. In this implementation, clades are collapsed according to 617 their properties in the original tree, not the properties when tree 618 traversal reaches the clade. (It's easier to debug.) If you want the 619 other behavior (incremental testing), modifying the source code of this 620 function is straightforward. 621 """ 622 # Read the iterable into a list to protect against in-place changes 623 matches = list(self.find_clades(target, False, 'level', **kwargs)) 624 if not matches: 625 # No matching nodes to collapse 626 return 627 # Skip the root node -- it can't be collapsed 628 if matches[0] == self.root: 629 matches.pop(0) 630 for clade in matches: 631 self.collapse(clade)
632
633 - def ladderize(self, reverse=False):
634 """Sort clades in-place according to the number of terminal nodes. 635 636 Deepest clades are last by default. Use ``reverse=True`` to sort clades 637 deepest-to-shallowest. 638 """ 639 self.root.clades.sort(key=lambda c: c.count_terminals(), 640 reverse=reverse) 641 for subclade in self.root.clades: 642 subclade.ladderize(reverse=reverse)
643
644 - def prune(self, target=None, **kwargs):
645 """Prunes a terminal clade from the tree. 646 647 If taxon is from a bifurcation, the connecting node will be collapsed 648 and its branch length added to remaining terminal node. This might be no 649 longer be a meaningful value. 650 651 :returns: parent clade of the pruned target 652 """ 653 if 'terminal' in kwargs and kwargs['terminal']: 654 raise ValueError("target must be terminal") 655 path = self.get_path(target, terminal=True, **kwargs) 656 if not path: 657 raise ValueError("can't find a matching target below this root") 658 if len(path) == 1: 659 parent = self.root 660 else: 661 parent = path[-2] 662 parent.clades.remove(path[-1]) 663 if len(parent) == 1: 664 # We deleted a branch from a bifurcation 665 if parent == self.root: 666 # If we're at the root, move the root upwards 667 # NB: This loses the length of the original branch 668 newroot = parent.clades[0] 669 newroot.branch_length = None 670 parent = self.root = newroot 671 else: 672 # If we're not at the root, collapse this parent 673 child = parent.clades[0] 674 if child.branch_length is not None: 675 child.branch_length += (parent.branch_length or 0.0) 676 if len(path) < 3: 677 grandparent = self.root 678 else: 679 grandparent = path[-3] 680 # Replace parent with child at the same place in grandparent 681 index = grandparent.clades.index(parent) 682 grandparent.clades.pop(index) 683 grandparent.clades.insert(index, child) 684 parent = grandparent 685 return parent
686
687 - def split(self, n=2, branch_length=1.0):
688 """Generate n (default 2) new descendants. 689 690 In a species tree, this is a speciation event. 691 692 New clades have the given branch_length and the same name as this 693 clade's root plus an integer suffix (counting from 0). For example, 694 splitting a clade named "A" produces sub-clades named "A0" and "A1". 695 If the clade has no name, the prefix "n" is used for child nodes, e.g. 696 "n0" and "n1". 697 """ 698 clade_cls = type(self.root) 699 base_name = self.root.name or 'n' 700 for i in range(n): 701 clade = clade_cls(name=base_name + str(i), 702 branch_length=branch_length) 703 self.root.clades.append(clade)
704
705 706 -class Tree(TreeElement, TreeMixin):
707 """A phylogenetic tree, containing global info for the phylogeny. 708 709 The structure and node-specific data is accessible through the 'root' 710 clade attached to the Tree instance. 711 712 :Parameters: 713 root : Clade 714 The starting node of the tree. If the tree is rooted, this will 715 usually be the root node. 716 rooted : bool 717 Whether or not the tree is rooted. By default, a tree is assumed to 718 be rooted. 719 id : str 720 The identifier of the tree, if there is one. 721 name : str 722 The name of the tree, in essence a label. 723 """ 724
725 - def __init__(self, root=None, rooted=True, id=None, name=None):
726 self.root = root or Clade() 727 self.rooted = rooted 728 self.id = id 729 self.name = name
730 731 @classmethod
732 - def from_clade(cls, clade, **kwargs):
733 """Create a new Tree object given a clade. 734 735 Keyword arguments are the usual `Tree` constructor parameters. 736 """ 737 root = copy.deepcopy(clade) 738 return cls(root, **kwargs)
739 740 @classmethod
741 - def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
742 """Create a randomized bifurcating tree given a list of taxa. 743 744 :param taxa: Either an integer specifying the number of taxa to create 745 (automatically named taxon#), or an iterable of taxon names, as 746 strings. 747 748 :returns: a tree of the same type as this class. 749 """ 750 if isinstance(taxa, int): 751 taxa = ['taxon%s' % (i + 1) for i in range(taxa)] 752 elif hasattr(taxa, '__iter__'): 753 taxa = list(taxa) 754 else: 755 raise TypeError("taxa argument must be integer (# taxa) or " 756 "iterable of taxon names.") 757 rtree = cls() 758 terminals = [rtree.root] 759 while len(terminals) < len(taxa): 760 newsplit = random.choice(terminals) 761 newsplit.split(branch_length=branch_length) 762 newterms = newsplit.clades 763 if branch_stdev: 764 # Add some noise to the branch lengths 765 for nt in newterms: 766 nt.branch_length = max(0, 767 random.gauss(branch_length, branch_stdev)) 768 terminals.remove(newsplit) 769 terminals.extend(newterms) 770 # Distribute taxon labels randomly 771 random.shuffle(taxa) 772 for node, name in zip(terminals, taxa): 773 node.name = name 774 return rtree
775 776 @property
777 - def clade(self):
778 """The first clade in this tree (not itself).""" 779 return self.root
780
781 - def as_phyloxml(self, **kwargs):
782 """Convert this tree to a PhyloXML-compatible Phylogeny. 783 784 This lets you use the additional annotation types PhyloXML defines, and 785 save this information when you write this tree as 'phyloxml'. 786 """ 787 from Bio.Phylo.PhyloXML import Phylogeny 788 return Phylogeny.from_tree(self, **kwargs)
789 790 # XXX Py3 Compatibility: In Python 3.0+, **kwargs can be replaced with the 791 # named keyword argument outgroup_branch_length=None
792 - def root_with_outgroup(self, outgroup_targets, *more_targets, **kwargs):
793 """Reroot this tree with the outgroup clade containing outgroup_targets. 794 795 Operates in-place. 796 797 Edge cases: 798 799 - If ``outgroup == self.root``, no change 800 - If outgroup is terminal, create new bifurcating root node with a 801 0-length branch to the outgroup 802 - If outgroup is internal, use the given outgroup node as the new 803 trifurcating root, keeping branches the same 804 - If the original root was bifurcating, drop it from the tree, 805 preserving total branch lengths 806 807 :param outgroup_branch_length: length of the branch leading to the 808 outgroup after rerooting. If not specified (None), then: 809 810 - If the outgroup is an internal node (not a single terminal taxon), 811 then use that node as the new root. 812 - Otherwise, create a new root node as the parent of the outgroup. 813 814 """ 815 # This raises a ValueError if any target is not in this tree 816 # Otherwise, common_ancestor guarantees outgroup is in this tree 817 outgroup = self.common_ancestor(outgroup_targets, *more_targets) 818 outgroup_path = self.get_path(outgroup) 819 if len(outgroup_path) == 0: 820 # Outgroup is the current root -- no change 821 return 822 823 prev_blen = outgroup.branch_length or 0.0 824 # Hideous kludge because Py2.x doesn't allow keyword args after *args 825 outgroup_branch_length = kwargs.get('outgroup_branch_length') 826 if outgroup_branch_length is not None: 827 assert 0 <= outgroup_branch_length <= prev_blen, \ 828 "outgroup_branch_length must be between 0 and the " \ 829 "original length of the branch leading to the outgroup." 830 831 if outgroup.is_terminal() or outgroup_branch_length is not None: 832 # Create a new root with a 0-length branch to the outgroup 833 outgroup.branch_length = outgroup_branch_length or 0.0 834 new_root = self.root.__class__( 835 branch_length=self.root.branch_length, clades=[outgroup]) 836 # The first branch reversal (see the upcoming loop) is modified 837 if len(outgroup_path) == 1: 838 # No nodes between the original root and outgroup to rearrange. 839 # Most of the code below will be skipped, but we still need 840 # 'new_parent' pointing at the new root. 841 new_parent = new_root 842 else: 843 parent = outgroup_path.pop(-2) 844 # First iteration of reversing the path to the outgroup 845 parent.clades.pop(parent.clades.index(outgroup)) 846 (prev_blen, parent.branch_length) = (parent.branch_length, 847 prev_blen - outgroup.branch_length) 848 new_root.clades.insert(0, parent) 849 new_parent = parent 850 else: 851 # Use the given outgroup node as the new (trifurcating) root 852 new_root = outgroup 853 new_root.branch_length = self.root.branch_length 854 new_parent = new_root 855 856 # Tracing the outgroup lineage backwards, reattach the subclades under a 857 # new root clade. Reverse the branches directly above the outgroup in 858 # the tree, but keep the descendants of those clades as they are. 859 for parent in outgroup_path[-2::-1]: 860 parent.clades.pop(parent.clades.index(new_parent)) 861 prev_blen, parent.branch_length = parent.branch_length, prev_blen 862 new_parent.clades.insert(0, parent) 863 new_parent = parent 864 865 # Finally, handle the original root according to number of descendents 866 old_root = self.root 867 if outgroup in old_root.clades: 868 assert len(outgroup_path) == 1 869 old_root.clades.pop(old_root.clades.index(outgroup)) 870 else: 871 old_root.clades.pop(old_root.clades.index(new_parent)) 872 if len(old_root) == 1: 873 # Delete the old bifurcating root & add branch lengths 874 ingroup = old_root.clades[0] 875 if ingroup.branch_length: 876 ingroup.branch_length += prev_blen 877 else: 878 ingroup.branch_length = prev_blen 879 new_parent.clades.insert(0, ingroup) 880 # ENH: If annotations are attached to old_root, do... something. 881 else: 882 # Keep the old trifurcating/polytomous root as an internal node 883 old_root.branch_length = prev_blen 884 new_parent.clades.insert(0, old_root) 885 886 self.root = new_root 887 self.rooted = True 888 return
889
890 - def root_at_midpoint(self):
891 """Root the tree at the midpoint of the two most distant taxa. 892 893 This operates in-place, leaving a bifurcating root. The topology of the 894 tree is otherwise retained, though no guarantees are made about the 895 stability of clade/node/taxon ordering. 896 """ 897 # Identify the largest pairwise distance 898 max_distance = 0.0 899 tips = self.get_terminals() 900 for tip in tips: 901 self.root_with_outgroup(tip) 902 new_max = max(self.depths().items(), key=lambda nd: nd[1]) 903 if new_max[1] > max_distance: 904 tip1 = tip 905 tip2 = new_max[0] 906 max_distance = new_max[1] 907 self.root_with_outgroup(tip1) 908 # Depth to go from the ingroup tip toward the outgroup tip 909 root_remainder = 0.5 * (max_distance - (self.root.branch_length or 0)) 910 assert root_remainder >= 0 911 # Identify the midpoint and reroot there. 912 # Trace the path to the outgroup tip until all of the root depth has 913 # been traveled/accounted for. 914 for node in self.get_path(tip2): 915 root_remainder -= node.branch_length 916 if root_remainder < 0: 917 outgroup_node = node 918 outgroup_branch_length = -root_remainder 919 break 920 else: 921 raise ValueError("Somehow, failed to find the midpoint!") 922 self.root_with_outgroup(outgroup_node, 923 outgroup_branch_length=outgroup_branch_length)
924 925 # Method assumed by TreeMixin 926
927 - def is_terminal(self):
928 """True if the root of this tree is terminal.""" 929 return (not self.root.clades)
930 931 # Convention from SeqRecord and Alignment classes 932
933 - def __format__(self, format_spec):
934 """Serialize the tree as a string in the specified file format. 935 936 This method supports the ``format`` built-in function added in Python 937 2.6/3.0. 938 939 :param format_spec: a lower-case string supported by `Bio.Phylo.write` 940 as an output file format. 941 """ 942 if format_spec: 943 from Bio._py3k import StringIO 944 from Bio.Phylo import _io 945 handle = StringIO() 946 _io.write([self], handle, format_spec) 947 return handle.getvalue() 948 else: 949 # Follow python convention and default to using __str__ 950 return str(self)
951
952 - def format(self, format):
953 """Serialize the tree as a string in the specified file format. 954 955 This duplicates the __format__ magic method for pre-2.6 Pythons. 956 """ 957 return self.__format__(format)
958 959 # Pretty-printer for the entire tree hierarchy 960
961 - def __str__(self):
962 """String representation of the entire tree. 963 964 Serializes each sub-clade recursively using ``repr`` to create a summary 965 of the object structure. 966 """ 967 TAB = ' ' 968 textlines = [] 969 970 def print_tree(obj, indent): 971 """Recursively serialize sub-elements. 972 973 This closes over textlines and modifies it in-place. 974 """ 975 if isinstance(obj, (Tree, Clade)): 976 # Avoid infinite recursion or special formatting from str() 977 objstr = repr(obj) 978 else: 979 objstr = as_string(obj) 980 textlines.append(TAB * indent + objstr) 981 indent += 1 982 for attr in obj.__dict__: 983 child = getattr(obj, attr) 984 if isinstance(child, TreeElement): 985 print_tree(child, indent) 986 elif isinstance(child, list): 987 for elem in child: 988 if isinstance(elem, TreeElement): 989 print_tree(elem, indent)
990 991 print_tree(self, 0) 992 return '\n'.join(textlines)
993
994 995 -class Clade(TreeElement, TreeMixin):
996 """A recursively defined sub-tree. 997 998 :Parameters: 999 branch_length : str 1000 The length of the branch leading to the root node of this clade. 1001 name : str 1002 The clade's name (a label). 1003 clades : list 1004 Sub-trees rooted directly under this tree's root. 1005 confidence : number 1006 Support. 1007 color : BranchColor 1008 The display color of the branch and descendents. 1009 width : number 1010 The display width of the branch and descendents. 1011 """ 1012
1013 - def __init__(self, branch_length=None, name=None, clades=None, 1014 confidence=None, color=None, width=None):
1015 self.branch_length = branch_length 1016 self.name = name 1017 self.clades = clades or [] 1018 self.confidence = confidence 1019 self.color = color 1020 self.width = width
1021 1022 @property
1023 - def root(self):
1024 """Allow TreeMixin methods to traverse clades properly.""" 1025 return self
1026
1027 - def is_terminal(self):
1028 """True if this is a terminal (leaf) node.""" 1029 return (not self.clades)
1030 1031 # Sequence-type behavior methods 1032
1033 - def __getitem__(self, index):
1034 """Get clades by index (integer or slice).""" 1035 if isinstance(index, (int, slice)): 1036 return self.clades[index] 1037 ref = self 1038 for idx in index: 1039 ref = ref[idx] 1040 return ref
1041
1042 - def __iter__(self):
1043 """Iterate through this tree's direct descendent clades (sub-trees).""" 1044 return iter(self.clades)
1045
1046 - def __len__(self):
1047 """Number of clades directy under the root.""" 1048 return len(self.clades)
1049 1050 # Python 3:
1051 - def __bool__(self):
1052 """Boolean value of an instance of this class (True). 1053 1054 NB: If this method is not defined, but ``__len__`` is, then the object 1055 is considered true if the result of ``__len__()`` is nonzero. We want 1056 Clade instances to always be considered True. 1057 """ 1058 return True
1059 # Python 2: 1060 __nonzero__ = __bool__ 1061
1062 - def __str__(self):
1063 if self.name: 1064 return _utils.trim_str(self.name, 40, '...') 1065 return self.__class__.__name__
1066 1067 # Syntax sugar for setting the branch color
1068 - def _get_color(self):
1069 return self._color
1070
1071 - def _set_color(self, arg):
1072 if arg is None or isinstance(arg, BranchColor): 1073 self._color = arg 1074 elif isinstance(arg, basestring): 1075 if arg in BranchColor.color_names: 1076 # Known color name 1077 self._color = BranchColor.from_name(arg) 1078 elif arg.startswith('#') and len(arg) == 7: 1079 # HTML-style hex string 1080 self._color = BranchColor.from_hex(arg) 1081 else: 1082 raise ValueError("invalid color string %s" % arg) 1083 elif hasattr(arg, '__iter__') and len(arg) == 3: 1084 # RGB triplet 1085 self._color = BranchColor(*arg) 1086 else: 1087 raise ValueError("invalid color value %s" % arg)
1088 1089 color = property(_get_color, _set_color, doc="Branch color.")
1090
1091 1092 -class BranchColor(object):
1093 """Indicates the color of a clade when rendered graphically. 1094 1095 The color should be interpreted by client code (e.g. visualization 1096 programs) as applying to the whole clade, unless overwritten by the 1097 color(s) of sub-clades. 1098 1099 Color values must be integers from 0 to 255. 1100 """ 1101 1102 color_names = { 1103 'red': (255, 0, 0), 1104 'r': (255, 0, 0), 1105 'yellow': (255, 255, 0), 1106 'y': (255, 255, 0), 1107 'green': (0, 128, 0), 1108 'g': (0, 128, 0), 1109 'cyan': (0, 255, 255), 1110 'c': (0, 255, 255), 1111 'blue': (0, 0, 255), 1112 'b': (0, 0, 255), 1113 'magenta': (255, 0, 255), 1114 'm': (255, 0, 255), 1115 'black': (0, 0, 0), 1116 'k': (0, 0, 0), 1117 'white': (255, 255, 255), 1118 'w': (255, 255, 255), 1119 # Names standardized in HTML/CSS spec 1120 # http://w3schools.com/html/html_colornames.asp 1121 'maroon': (128, 0, 0), 1122 'olive': (128, 128, 0), 1123 'lime': (0, 255, 0), 1124 'aqua': (0, 255, 255), 1125 'teal': (0, 128, 128), 1126 'navy': (0, 0, 128), 1127 'fuchsia': (255, 0, 255), 1128 'purple': (128, 0, 128), 1129 'silver': (192, 192, 192), 1130 'gray': (128, 128, 128), 1131 # More definitions from matplotlib/gcolor2 1132 'grey': (128, 128, 128), 1133 'pink': (255, 192, 203), 1134 'salmon': (250, 128, 114), 1135 'orange': (255, 165, 0), 1136 'gold': (255, 215, 0), 1137 'tan': (210, 180, 140), 1138 'brown': (165, 42, 42), 1139 } 1140
1141 - def __init__(self, red, green, blue):
1142 for color in (red, green, blue): 1143 assert (isinstance(color, int) and 1144 0 <= color <= 255 1145 ), "Color values must be integers between 0 and 255." 1146 self.red = red 1147 self.green = green 1148 self.blue = blue
1149 1150 @classmethod
1151 - def from_hex(cls, hexstr):
1152 """Construct a BranchColor object from a hexadecimal string. 1153 1154 The string format is the same style used in HTML and CSS, such as 1155 '#FF8000' for an RGB value of (255, 128, 0). 1156 """ 1157 assert (isinstance(hexstr, basestring) and 1158 hexstr.startswith('#') and 1159 len(hexstr) == 7 1160 ), "need a 24-bit hexadecimal string, e.g. #000000" 1161 1162 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:] 1163 return cls(*[int('0x' + cc, base=16) for cc in RGB])
1164 1165 @classmethod
1166 - def from_name(cls, colorname):
1167 """Construct a BranchColor object by the color's name.""" 1168 return cls(*cls.color_names[colorname])
1169
1170 - def to_hex(self):
1171 """Return a 24-bit hexadecimal RGB representation of this color. 1172 1173 The returned string is suitable for use in HTML/CSS, as a color 1174 parameter in matplotlib, and perhaps other situations. 1175 1176 Example: 1177 1178 >>> bc = BranchColor(12, 200, 100) 1179 >>> bc.to_hex() 1180 '#0cc864' 1181 """ 1182 return "#%02x%02x%02x" % (self.red, self.green, self.blue)
1183
1184 - def to_rgb(self):
1185 """Return a tuple of RGB values (0 to 255) representing this color. 1186 1187 Example: 1188 1189 >>> bc = BranchColor(255, 165, 0) 1190 >>> bc.to_rgb() 1191 (255, 165, 0) 1192 """ 1193 return (self.red, self.green, self.blue)
1194
1195 - def __repr__(self):
1196 """Preserve the standard RGB order when representing this object.""" 1197 return ('%s(red=%d, green=%d, blue=%d)' 1198 % (self.__class__.__name__, self.red, self.green, self.blue))
1199
1200 - def __str__(self):
1201 """Show the color's RGB values.""" 1202 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
1203