Package Bio :: Package Phylo :: Module BaseTree
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.BaseTree

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Base classes for Bio.Phylo objects. 
   7   
   8  All object representations for phylogenetic trees should derive from these base 
   9  classes in order to use the common methods defined on them. 
  10  """ 
  11   
  12  from Bio._py3k import basestring, filter, unicode, zip 
  13   
  14  import collections 
  15  import copy 
  16  import itertools 
  17  import random 
  18  import re 
  19   
  20  from Bio import _utils 
  21   
  22  # NB: On Python 2, repr() and str() are specified to return byte strings, not 
  23  # unicode. On Python 3, it's the opposite. Horrible. 
  24  import sys 
  25  if sys.version_info[0] < 3: 
26 - def as_string(s):
27 """Encode string to UTF-8.""" 28 if isinstance(s, unicode): 29 return s.encode('utf-8') 30 return str(s)
31 else: 32 as_string = str
33 34 35 # General tree-traversal algorithms 36 37 -def _level_traverse(root, get_children):
38 """Traverse a tree in breadth-first (level) order.""" 39 Q = collections.deque([root]) 40 while Q: 41 v = Q.popleft() 42 yield v 43 Q.extend(get_children(v))
44
45 46 -def _preorder_traverse(root, get_children):
47 """Traverse a tree in depth-first pre-order (parent before children).""" 48 def dfs(elem): 49 yield elem 50 for v in get_children(elem): 51 for u in dfs(v): 52 yield u
53 for elem in dfs(root): 54 yield elem 55
56 57 -def _postorder_traverse(root, get_children):
58 """Traverse a tree in depth-first post-order (children before parent).""" 59 def dfs(elem): 60 for v in get_children(elem): 61 for u in dfs(v): 62 yield u 63 yield elem
64 for elem in dfs(root): 65 yield elem 66
67 68 -def _sorted_attrs(elem):
69 """Get a flat list of elem's attributes, sorted for consistency.""" 70 singles = [] 71 lists = [] 72 # Sort attributes for consistent results 73 for attrname, child in sorted(elem.__dict__.items(), 74 key=lambda kv: kv[0]): 75 if child is None: 76 continue 77 if isinstance(child, list): 78 lists.extend(child) 79 else: 80 singles.append(child) 81 return (x for x in singles + lists 82 if isinstance(x, TreeElement))
83
84 85 # Factory functions to generalize searching for clades/nodes 86 87 -def _identity_matcher(target):
88 """Match a node to the target object by identity.""" 89 def match(node): 90 return (node is target)
91 return match 92
93 94 -def _class_matcher(target_cls):
95 """Match a node if it's an instance of the given class.""" 96 def match(node): 97 return isinstance(node, target_cls)
98 return match 99
100 101 -def _string_matcher(target):
102 def match(node): 103 if isinstance(node, (Clade, Tree)): 104 # Avoid triggering specialized or recursive magic methods 105 return node.name == target 106 return as_string(node) == target
107 return match 108
109 110 -def _attribute_matcher(kwargs):
111 """Match a node by specified attribute values. 112 113 ``terminal`` is a special case: True restricts the search to external (leaf) 114 nodes, False restricts to internal nodes, and None allows all tree elements 115 to be searched, including phyloXML annotations. 116 117 Otherwise, for a tree element to match the specification (i.e. for the 118 function produced by `_attribute_matcher` to return True when given a tree 119 element), it must have each of the attributes specified by the keys and 120 match each of the corresponding values -- think 'and', not 'or', for 121 multiple keys. 122 """ 123 def match(node): 124 if 'terminal' in kwargs: 125 # Special case: restrict to internal/external/any nodes 126 kwa_copy = kwargs.copy() 127 pattern = kwa_copy.pop('terminal') 128 if (pattern is not None and 129 (not hasattr(node, 'is_terminal') or 130 node.is_terminal() != pattern)): 131 return False 132 else: 133 kwa_copy = kwargs 134 for key, pattern in kwa_copy.items(): 135 # Nodes must match all other specified attributes 136 if not hasattr(node, key): 137 return False 138 target = getattr(node, key) 139 if isinstance(pattern, basestring): 140 return (isinstance(target, basestring) and 141 re.match(pattern + '$', target)) 142 if isinstance(pattern, bool): 143 return (pattern == bool(target)) 144 if isinstance(pattern, int): 145 return (pattern == target) 146 if pattern is None: 147 return (target is None) 148 raise TypeError('invalid query type: %s' % type(pattern)) 149 return True
150 return match 151
152 153 -def _function_matcher(matcher_func):
154 """Safer attribute lookup -- returns False instead of raising an error.""" 155 def match(node): 156 try: 157 return matcher_func(node) 158 except (LookupError, AttributeError, ValueError, TypeError): 159 return False
160 return match 161
162 163 -def _object_matcher(obj):
164 """Retrieve a matcher function by passing an arbitrary object. 165 166 i.e. passing a `TreeElement` such as a `Clade` or `Tree` instance returns an 167 identity matcher, passing a type such as the `PhyloXML.Taxonomy` class 168 returns a class matcher, and passing a dictionary returns an attribute 169 matcher. 170 171 The resulting 'match' function returns True when given an object matching 172 the specification (identity, type or attribute values), otherwise False. 173 This is useful for writing functions that search the tree, and probably 174 shouldn't be used directly by the end user. 175 """ 176 if isinstance(obj, TreeElement): 177 return _identity_matcher(obj) 178 if isinstance(obj, type): 179 return _class_matcher(obj) 180 if isinstance(obj, basestring): 181 return _string_matcher(obj) 182 if isinstance(obj, dict): 183 return _attribute_matcher(obj) 184 if callable(obj): 185 return _function_matcher(obj) 186 raise ValueError("%s (type %s) is not a valid type for comparison." 187 % (obj, type(obj)))
188
189 190 -def _combine_matchers(target, kwargs, require_spec):
191 """Merge target specifications with keyword arguments. 192 193 Dispatch the components to the various matcher functions, then merge into a 194 single boolean function. 195 """ 196 if not target: 197 if not kwargs: 198 if require_spec: 199 raise ValueError("you must specify a target object or keyword " 200 "arguments.") 201 return lambda x: True 202 return _attribute_matcher(kwargs) 203 match_obj = _object_matcher(target) 204 if not kwargs: 205 return match_obj 206 match_kwargs = _attribute_matcher(kwargs) 207 return (lambda x: match_obj(x) and match_kwargs(x))
208
209 210 -def _combine_args(first, *rest):
211 """Convert ``[targets]`` or ``*targets`` arguments to a single iterable. 212 213 This helps other functions work like the built-in functions `max` and 214 `min`. 215 """ 216 # Background: is_monophyletic takes a single list or iterable (like the 217 # same method in Bio.Nexus.Trees); root_with_outgroup and common_ancestor 218 # take separate arguments. This mismatch was in the initial release and I 219 # didn't notice the inconsistency until after Biopython 1.55. I can think 220 # of cases where either style is more convenient, so let's support both 221 # (for backward compatibility and consistency between methods). 222 if hasattr(first, '__iter__') and not (isinstance(first, TreeElement) or 223 isinstance(first, type) or 224 isinstance(first, basestring) or 225 isinstance(first, dict)): 226 # `terminals` is an iterable of targets 227 if rest: 228 raise ValueError("Arguments must be either a single list of " 229 "targets, or separately specified targets " 230 "(e.g. foo(t1, t2, t3)), but not both.") 231 return first 232 # `terminals` is a single target -- wrap in a container 233 return itertools.chain([first], rest)
234
235 236 # Class definitions 237 238 -class TreeElement(object):
239 """Base class for all Bio.Phylo classes.""" 240
241 - def __repr__(self):
242 """Show this object's constructor with its primitive arguments.""" 243 def pair_as_kwarg_string(key, val): 244 if isinstance(val, basestring): 245 return ("%s='%s'" 246 % (key, _utils.trim_str(as_string(val), 60, '...'))) 247 return "%s=%s" % (key, val)
248 return ('%s(%s)' 249 % (self.__class__.__name__, 250 ', '.join(pair_as_kwarg_string(key, val) 251 for key, val in sorted(self.__dict__.items()) 252 if val is not None and 253 type(val) in (str, int, float, bool, unicode))))
254 255 __str__ = __repr__ 256
257 258 -class TreeMixin(object):
259 """Methods for Tree- and Clade-based classes. 260 261 This lets `Tree` and `Clade` support the same traversal and searching 262 operations without requiring Clade to inherit from Tree, so Clade isn't 263 required to have all of Tree's attributes -- just ``root`` (a Clade 264 instance) and ``is_terminal``. 265 """ 266 267 # Traversal methods 268
269 - def _filter_search(self, filter_func, order, follow_attrs):
270 """Perform a BFS or DFS traversal through all elements in this tree. 271 272 :returns: generator of all elements for which `filter_func` is True. 273 274 """ 275 order_opts = {'preorder': _preorder_traverse, 276 'postorder': _postorder_traverse, 277 'level': _level_traverse} 278 try: 279 order_func = order_opts[order] 280 except KeyError: 281 raise ValueError("Invalid order '%s'; must be one of: %s" 282 % (order, tuple(order_opts))) 283 if follow_attrs: 284 get_children = _sorted_attrs 285 root = self 286 else: 287 get_children = lambda elem: elem.clades 288 root = self.root 289 return filter(filter_func, order_func(root, get_children))
290
291 - def find_any(self, *args, **kwargs):
292 """Return the first element found by find_elements(), or None. 293 294 This is also useful for checking whether any matching element exists in 295 the tree, and can be used in a conditional expression. 296 """ 297 hits = self.find_elements(*args, **kwargs) 298 try: 299 return next(hits) 300 except StopIteration: 301 return None
302
303 - def find_elements(self, target=None, terminal=None, order='preorder', 304 **kwargs):
305 """Find all tree elements matching the given attributes. 306 307 The arbitrary keyword arguments indicate the attribute name of the 308 sub-element and the value to match: string, integer or boolean. Strings 309 are evaluated as regular expression matches; integers are compared 310 directly for equality, and booleans evaluate the attribute's truth value 311 (True or False) before comparing. To handle nonzero floats, search with 312 a boolean argument, then filter the result manually. 313 314 If no keyword arguments are given, then just the class type is used for 315 matching. 316 317 The result is an iterable through all matching objects, by depth-first 318 search. (Not necessarily the same order as the elements appear in the 319 source file!) 320 321 :Parameters: 322 target : TreeElement instance, type, dict, or callable 323 Specifies the characteristics to search for. (The default, 324 TreeElement, matches any standard Bio.Phylo type.) 325 terminal : bool 326 A boolean value to select for or against terminal nodes (a.k.a. 327 leaf nodes). True searches for only terminal nodes, False 328 excludes terminal nodes, and the default, None, searches both 329 terminal and non-terminal nodes, as well as any tree elements 330 lacking the ``is_terminal`` method. 331 order : {'preorder', 'postorder', 'level'} 332 Tree traversal order: 'preorder' (default) is depth-first 333 search, 'postorder' is DFS with child nodes preceding parents, 334 and 'level' is breadth-first search. 335 336 Example 337 ------- 338 339 >>> from Bio import Phylo 340 >>> phx = Phylo.PhyloXMLIO.read('PhyloXML/phyloxml_examples.xml') 341 >>> matches = phx.phylogenies[5].find_elements(code='OCTVU') 342 >>> next(matches) 343 Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris') 344 345 """ 346 if terminal is not None: 347 kwargs['terminal'] = terminal 348 is_matching_elem = _combine_matchers(target, kwargs, False) 349 return self._filter_search(is_matching_elem, order, True)
350
351 - def find_clades(self, target=None, terminal=None, order='preorder', 352 **kwargs):
353 """Find each clade containing a matching element. 354 355 That is, find each element as with find_elements(), but return the 356 corresponding clade object. (This is usually what you want.) 357 358 :returns: an iterable through all matching objects, searching 359 depth-first (preorder) by default. 360 361 """ 362 def match_attrs(elem): 363 orig_clades = elem.__dict__.pop('clades') 364 found = elem.find_any(target, **kwargs) 365 elem.clades = orig_clades 366 return (found is not None)
367 if terminal is None: 368 is_matching_elem = match_attrs 369 else: 370 def is_matching_elem(elem): 371 return ((elem.is_terminal() == terminal) and 372 match_attrs(elem))
373 return self._filter_search(is_matching_elem, order, False) 374
375 - def get_path(self, target=None, **kwargs):
376 """List the clades directly between this root and the given target. 377 378 :returns: list of all clade objects along this path, ending with the 379 given target, but excluding the root clade. 380 381 """ 382 # Only one path will work -- ignore weights and visits 383 path = [] 384 match = _combine_matchers(target, kwargs, True) 385 386 def check_in_path(v): 387 if match(v): 388 path.append(v) 389 return True 390 elif v.is_terminal(): 391 return False 392 for child in v: 393 if check_in_path(child): 394 path.append(v) 395 return True 396 return False
397 398 if not check_in_path(self.root): 399 return None 400 return path[-2::-1] 401
402 - def get_nonterminals(self, order='preorder'):
403 """Get a list of all of this tree's nonterminal (internal) nodes.""" 404 return list(self.find_clades(terminal=False, order=order))
405
406 - def get_terminals(self, order='preorder'):
407 """Get a list of all of this tree's terminal (leaf) nodes.""" 408 return list(self.find_clades(terminal=True, order=order))
409
410 - def trace(self, start, finish):
411 """List of all clade object between two targets in this tree. 412 413 Excluding `start`, including `finish`. 414 """ 415 mrca = self.common_ancestor(start, finish) 416 fromstart = mrca.get_path(start)[-2::-1] 417 to = mrca.get_path(finish) 418 return fromstart + [mrca] + to
419 420 # Information methods 421
422 - def common_ancestor(self, targets, *more_targets):
423 """Most recent common ancestor (clade) of all the given targets. 424 425 Edge cases: 426 - If no target is given, returns self.root 427 - If 1 target is given, returns the target 428 - If any target is not found in this tree, raises a ValueError 429 430 """ 431 paths = [self.get_path(t) 432 for t in _combine_args(targets, *more_targets)] 433 # Validation -- otherwise izip throws a spooky error below 434 for p, t in zip(paths, targets): 435 if p is None: 436 raise ValueError("target %s is not in this tree" % repr(t)) 437 mrca = self.root 438 for level in zip(*paths): 439 ref = level[0] 440 for other in level[1:]: 441 if ref is not other: 442 break 443 else: 444 mrca = ref 445 if ref is not mrca: 446 break 447 return mrca
448
449 - def count_terminals(self):
450 """Counts the number of terminal (leaf) nodes within this tree.""" 451 return _utils.iterlen(self.find_clades(terminal=True))
452
453 - def depths(self, unit_branch_lengths=False):
454 """Create a mapping of tree clades to depths (by branch length). 455 456 :Parameters: 457 unit_branch_lengths : bool 458 If True, count only the number of branches (levels in the tree). 459 By default the distance is the cumulative branch length leading 460 to the clade. 461 462 :returns: dict of {clade: depth}, where keys are all of the Clade 463 instances in the tree, and values are the distance from the root to 464 each clade (including terminals). 465 466 """ 467 if unit_branch_lengths: 468 depth_of = lambda c: 1 469 else: 470 depth_of = lambda c: c.branch_length or 0 471 depths = {} 472 473 def update_depths(node, curr_depth): 474 depths[node] = curr_depth 475 for child in node.clades: 476 new_depth = curr_depth + depth_of(child) 477 update_depths(child, new_depth)
478 479 update_depths(self.root, self.root.branch_length or 0) 480 return depths 481
482 - def distance(self, target1, target2=None):
483 """Calculate the sum of the branch lengths between two targets. 484 485 If only one target is specified, the other is the root of this tree. 486 """ 487 if target2 is None: 488 return sum(n.branch_length for n in self.get_path(target1) 489 if n.branch_length is not None) 490 mrca = self.common_ancestor(target1, target2) 491 return mrca.distance(target1) + mrca.distance(target2)
492
493 - def is_bifurcating(self):
494 """Return True if tree downstream of node is strictly bifurcating. 495 496 I.e., all nodes have either 2 or 0 children (internal or external, 497 respectively). The root may have 3 descendents and still be considered 498 part of a bifurcating tree, because it has no ancestor. 499 """ 500 # Root can be trifurcating 501 if isinstance(self, Tree) and len(self.root) == 3: 502 return (self.root.clades[0].is_bifurcating() and 503 self.root.clades[1].is_bifurcating() and 504 self.root.clades[2].is_bifurcating()) 505 if len(self.root) == 2: 506 return (self.root.clades[0].is_bifurcating() and 507 self.root.clades[1].is_bifurcating()) 508 if len(self.root) == 0: 509 return True 510 return False
511
512 - def is_monophyletic(self, terminals, *more_terminals):
513 """MRCA of terminals if they comprise a complete subclade, or False. 514 515 I.e., there exists a clade such that its terminals are the same set as 516 the given targets. 517 518 The given targets must be terminals of the tree. 519 520 To match both `Bio.Nexus.Trees` and the other multi-target methods in 521 Bio.Phylo, arguments to this method can be specified either of two ways: 522 (i) as a single list of targets, or (ii) separately specified targets, 523 e.g. is_monophyletic(t1, t2, t3) -- but not both. 524 525 For convenience, this method returns the common ancestor (MCRA) of the 526 targets if they are monophyletic (instead of the value True), and False 527 otherwise. 528 529 :returns: common ancestor if terminals are monophyletic, otherwise False. 530 531 """ 532 target_set = set(_combine_args(terminals, *more_terminals)) 533 current = self.root 534 while True: 535 if set(current.get_terminals()) == target_set: 536 return current 537 # Try a narrower subclade 538 for subclade in current.clades: 539 if set(subclade.get_terminals()).issuperset(target_set): 540 current = subclade 541 break 542 else: 543 return False
544
545 - def is_parent_of(self, target=None, **kwargs):
546 """True if target is a descendent of this tree. 547 548 Not required to be a direct descendent. 549 550 To check only direct descendents of a clade, simply use list membership 551 testing: ``if subclade in clade: ...`` 552 """ 553 return self.get_path(target, **kwargs) is not None
554
555 - def is_preterminal(self):
556 """True if all direct descendents are terminal.""" 557 if self.root.is_terminal(): 558 return False 559 for clade in self.root.clades: 560 if not clade.is_terminal(): 561 return False 562 return True
563
564 - def total_branch_length(self):
565 """Calculate the sum of all the branch lengths in this tree.""" 566 return sum(node.branch_length 567 for node in self.find_clades(branch_length=True))
568 569 # Tree manipulation methods 570
571 - def collapse(self, target=None, **kwargs):
572 """Deletes target from the tree, relinking its children to its parent. 573 574 :returns: the parent clade. 575 576 """ 577 path = self.get_path(target, **kwargs) 578 if not path: 579 raise ValueError("couldn't collapse %s in this tree" 580 % (target or kwargs)) 581 if len(path) == 1: 582 parent = self.root 583 else: 584 parent = path[-2] 585 popped = parent.clades.pop(parent.clades.index(path[-1])) 586 extra_length = popped.branch_length or 0 587 for child in popped: 588 child.branch_length += extra_length 589 parent.clades.extend(popped.clades) 590 return parent
591
592 - def collapse_all(self, target=None, **kwargs):
593 """Collapse all the descendents of this tree, leaving only terminals. 594 595 Total branch lengths are preserved, i.e. the distance to each terminal 596 stays the same. 597 598 For example, this will safely collapse nodes with poor bootstrap 599 support: 600 601 >>> from Bio import Phylo 602 >>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') 603 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 604 Total branch length 20.44 605 >>> tree.collapse_all(lambda c: c.confidence is not None and c.confidence < 70) 606 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 607 Total branch length 21.37 608 609 This implementation avoids strange side-effects by using level-order 610 traversal and testing all clade properties (versus the target 611 specification) up front. In particular, if a clade meets the target 612 specification in the original tree, it will be collapsed. For example, 613 if the condition is: 614 615 >>> from Bio import Phylo 616 >>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') 617 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 618 Total branch length 20.44 619 >>> tree.collapse_all(lambda c: c.branch_length < 0.1) 620 >>> print("Total branch length %0.2f" % tree.total_branch_length()) 621 Total branch length 21.13 622 623 Collapsing a clade's parent node adds the parent's branch length to the 624 child, so during the execution of collapse_all, a clade's branch_length 625 may increase. In this implementation, clades are collapsed according to 626 their properties in the original tree, not the properties when tree 627 traversal reaches the clade. (It's easier to debug.) If you want the 628 other behavior (incremental testing), modifying the source code of this 629 function is straightforward. 630 """ 631 # Read the iterable into a list to protect against in-place changes 632 matches = list(self.find_clades(target, False, 'level', **kwargs)) 633 if not matches: 634 # No matching nodes to collapse 635 return 636 # Skip the root node -- it can't be collapsed 637 if matches[0] == self.root: 638 matches.pop(0) 639 for clade in matches: 640 self.collapse(clade)
641
642 - def ladderize(self, reverse=False):
643 """Sort clades in-place according to the number of terminal nodes. 644 645 Deepest clades are last by default. Use ``reverse=True`` to sort clades 646 deepest-to-shallowest. 647 """ 648 self.root.clades.sort(key=lambda c: c.count_terminals(), 649 reverse=reverse) 650 for subclade in self.root.clades: 651 subclade.ladderize(reverse=reverse)
652
653 - def prune(self, target=None, **kwargs):
654 """Prunes a terminal clade from the tree. 655 656 If taxon is from a bifurcation, the connecting node will be collapsed 657 and its branch length added to remaining terminal node. This might be no 658 longer be a meaningful value. 659 660 :returns: parent clade of the pruned target 661 662 """ 663 if 'terminal' in kwargs and kwargs['terminal']: 664 raise ValueError("target must be terminal") 665 path = self.get_path(target, terminal=True, **kwargs) 666 if not path: 667 raise ValueError("can't find a matching target below this root") 668 if len(path) == 1: 669 parent = self.root 670 else: 671 parent = path[-2] 672 parent.clades.remove(path[-1]) 673 if len(parent) == 1: 674 # We deleted a branch from a bifurcation 675 if parent == self.root: 676 # If we're at the root, move the root upwards 677 # NB: This loses the length of the original branch 678 newroot = parent.clades[0] 679 newroot.branch_length = None 680 parent = self.root = newroot 681 else: 682 # If we're not at the root, collapse this parent 683 child = parent.clades[0] 684 if child.branch_length is not None: 685 child.branch_length += (parent.branch_length or 0.0) 686 if len(path) < 3: 687 grandparent = self.root 688 else: 689 grandparent = path[-3] 690 # Replace parent with child at the same place in grandparent 691 index = grandparent.clades.index(parent) 692 grandparent.clades.pop(index) 693 grandparent.clades.insert(index, child) 694 parent = grandparent 695 return parent
696
697 - def split(self, n=2, branch_length=1.0):
698 """Generate n (default 2) new descendants. 699 700 In a species tree, this is a speciation event. 701 702 New clades have the given branch_length and the same name as this 703 clade's root plus an integer suffix (counting from 0). For example, 704 splitting a clade named "A" produces sub-clades named "A0" and "A1". 705 If the clade has no name, the prefix "n" is used for child nodes, e.g. 706 "n0" and "n1". 707 """ 708 clade_cls = type(self.root) 709 base_name = self.root.name or 'n' 710 for i in range(n): 711 clade = clade_cls(name=base_name + str(i), 712 branch_length=branch_length) 713 self.root.clades.append(clade)
714
715 716 -class Tree(TreeElement, TreeMixin):
717 """A phylogenetic tree, containing global info for the phylogeny. 718 719 The structure and node-specific data is accessible through the 'root' 720 clade attached to the Tree instance. 721 722 :Parameters: 723 root : Clade 724 The starting node of the tree. If the tree is rooted, this will 725 usually be the root node. 726 rooted : bool 727 Whether or not the tree is rooted. By default, a tree is assumed to 728 be rooted. 729 id : str 730 The identifier of the tree, if there is one. 731 name : str 732 The name of the tree, in essence a label. 733 734 """ 735
736 - def __init__(self, root=None, rooted=True, id=None, name=None):
737 """Initialize parameter for phylogenetic tree.""" 738 self.root = root or Clade() 739 self.rooted = rooted 740 self.id = id 741 self.name = name
742 743 @classmethod
744 - def from_clade(cls, clade, **kwargs):
745 """Create a new Tree object given a clade. 746 747 Keyword arguments are the usual `Tree` constructor parameters. 748 """ 749 root = copy.deepcopy(clade) 750 return cls(root, **kwargs)
751 752 @classmethod
753 - def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
754 """Create a randomized bifurcating tree given a list of taxa. 755 756 :param taxa: Either an integer specifying the number of taxa to create 757 (automatically named taxon#), or an iterable of taxon names, as 758 strings. 759 760 :returns: a tree of the same type as this class. 761 762 """ 763 if isinstance(taxa, int): 764 taxa = ['taxon%s' % (i + 1) for i in range(taxa)] 765 elif hasattr(taxa, '__iter__'): 766 taxa = list(taxa) 767 else: 768 raise TypeError("taxa argument must be integer (# taxa) or " 769 "iterable of taxon names.") 770 rtree = cls() 771 terminals = [rtree.root] 772 while len(terminals) < len(taxa): 773 newsplit = random.choice(terminals) 774 newsplit.split(branch_length=branch_length) 775 newterms = newsplit.clades 776 if branch_stdev: 777 # Add some noise to the branch lengths 778 for nt in newterms: 779 nt.branch_length = max(0, 780 random.gauss(branch_length, branch_stdev)) 781 terminals.remove(newsplit) 782 terminals.extend(newterms) 783 # Distribute taxon labels randomly 784 random.shuffle(taxa) 785 for node, name in zip(terminals, taxa): 786 node.name = name 787 return rtree
788 789 @property
790 - def clade(self):
791 """The first clade in this tree (not itself).""" 792 return self.root
793
794 - def as_phyloxml(self, **kwargs):
795 """Convert this tree to a PhyloXML-compatible Phylogeny. 796 797 This lets you use the additional annotation types PhyloXML defines, and 798 save this information when you write this tree as 'phyloxml'. 799 """ 800 from Bio.Phylo.PhyloXML import Phylogeny 801 return Phylogeny.from_tree(self, **kwargs)
802 803 # XXX Py3 Compatibility: In Python 3.0+, **kwargs can be replaced with the 804 # named keyword argument outgroup_branch_length=None
805 - def root_with_outgroup(self, outgroup_targets, *more_targets, **kwargs):
806 """Reroot this tree with the outgroup clade containing outgroup_targets. 807 808 Operates in-place. 809 810 Edge cases: 811 - If ``outgroup == self.root``, no change 812 - If outgroup is terminal, create new bifurcating root node with a 813 0-length branch to the outgroup 814 - If outgroup is internal, use the given outgroup node as the new 815 trifurcating root, keeping branches the same 816 - If the original root was bifurcating, drop it from the tree, 817 preserving total branch lengths 818 819 :param outgroup_branch_length: length of the branch leading to the 820 outgroup after rerooting. If not specified (None), then: 821 822 - If the outgroup is an internal node (not a single terminal taxon), 823 then use that node as the new root. 824 - Otherwise, create a new root node as the parent of the outgroup. 825 826 """ 827 # This raises a ValueError if any target is not in this tree 828 # Otherwise, common_ancestor guarantees outgroup is in this tree 829 outgroup = self.common_ancestor(outgroup_targets, *more_targets) 830 outgroup_path = self.get_path(outgroup) 831 if len(outgroup_path) == 0: 832 # Outgroup is the current root -- no change 833 return 834 835 prev_blen = outgroup.branch_length or 0.0 836 # Hideous kludge because Py2.x doesn't allow keyword args after *args 837 outgroup_branch_length = kwargs.get('outgroup_branch_length') 838 if outgroup_branch_length is not None: 839 assert 0 <= outgroup_branch_length <= prev_blen, \ 840 "outgroup_branch_length must be between 0 and the " \ 841 "original length of the branch leading to the outgroup." 842 843 if outgroup.is_terminal() or outgroup_branch_length is not None: 844 # Create a new root with a 0-length branch to the outgroup 845 outgroup.branch_length = outgroup_branch_length or 0.0 846 new_root = self.root.__class__( 847 branch_length=self.root.branch_length, clades=[outgroup]) 848 # The first branch reversal (see the upcoming loop) is modified 849 if len(outgroup_path) == 1: 850 # No nodes between the original root and outgroup to rearrange. 851 # Most of the code below will be skipped, but we still need 852 # 'new_parent' pointing at the new root. 853 new_parent = new_root 854 else: 855 parent = outgroup_path.pop(-2) 856 # First iteration of reversing the path to the outgroup 857 parent.clades.pop(parent.clades.index(outgroup)) 858 (prev_blen, parent.branch_length) = (parent.branch_length, 859 prev_blen - outgroup.branch_length) 860 new_root.clades.insert(0, parent) 861 new_parent = parent 862 else: 863 # Use the given outgroup node as the new (trifurcating) root 864 new_root = outgroup 865 new_root.branch_length = self.root.branch_length 866 new_parent = new_root 867 868 # Tracing the outgroup lineage backwards, reattach the subclades under a 869 # new root clade. Reverse the branches directly above the outgroup in 870 # the tree, but keep the descendants of those clades as they are. 871 for parent in outgroup_path[-2::-1]: 872 parent.clades.pop(parent.clades.index(new_parent)) 873 prev_blen, parent.branch_length = parent.branch_length, prev_blen 874 new_parent.clades.insert(0, parent) 875 new_parent = parent 876 877 # Finally, handle the original root according to number of descendents 878 old_root = self.root 879 if outgroup in old_root.clades: 880 assert len(outgroup_path) == 1 881 old_root.clades.pop(old_root.clades.index(outgroup)) 882 else: 883 old_root.clades.pop(old_root.clades.index(new_parent)) 884 if len(old_root) == 1: 885 # Delete the old bifurcating root & add branch lengths 886 ingroup = old_root.clades[0] 887 if ingroup.branch_length: 888 ingroup.branch_length += prev_blen 889 else: 890 ingroup.branch_length = prev_blen 891 new_parent.clades.insert(0, ingroup) 892 # ENH: If annotations are attached to old_root, do... something. 893 else: 894 # Keep the old trifurcating/polytomous root as an internal node 895 old_root.branch_length = prev_blen 896 new_parent.clades.insert(0, old_root) 897 898 self.root = new_root 899 self.rooted = True 900 return
901
902 - def root_at_midpoint(self):
903 """Root the tree at the midpoint of the two most distant taxa. 904 905 This operates in-place, leaving a bifurcating root. The topology of the 906 tree is otherwise retained, though no guarantees are made about the 907 stability of clade/node/taxon ordering. 908 """ 909 # Identify the largest pairwise distance 910 max_distance = 0.0 911 tips = self.get_terminals() 912 for tip in tips: 913 self.root_with_outgroup(tip) 914 new_max = max(self.depths().items(), key=lambda nd: nd[1]) 915 if new_max[1] > max_distance: 916 tip1 = tip 917 tip2 = new_max[0] 918 max_distance = new_max[1] 919 self.root_with_outgroup(tip1) 920 # Depth to go from the ingroup tip toward the outgroup tip 921 root_remainder = 0.5 * (max_distance - (self.root.branch_length or 0)) 922 assert root_remainder >= 0 923 # Identify the midpoint and reroot there. 924 # Trace the path to the outgroup tip until all of the root depth has 925 # been traveled/accounted for. 926 for node in self.get_path(tip2): 927 root_remainder -= node.branch_length 928 if root_remainder < 0: 929 outgroup_node = node 930 outgroup_branch_length = -root_remainder 931 break 932 else: 933 raise ValueError("Somehow, failed to find the midpoint!") 934 self.root_with_outgroup(outgroup_node, 935 outgroup_branch_length=outgroup_branch_length)
936 937 # Method assumed by TreeMixin 938
939 - def is_terminal(self):
940 """True if the root of this tree is terminal.""" 941 return (not self.root.clades)
942 943 # Convention from SeqRecord and Alignment classes 944
945 - def __format__(self, format_spec):
946 """Serialize the tree as a string in the specified file format. 947 948 This method supports the ``format`` built-in function added in Python 949 2.6/3.0. 950 951 :param format_spec: a lower-case string supported by `Bio.Phylo.write` 952 as an output file format. 953 954 """ 955 if format_spec: 956 from Bio._py3k import StringIO 957 from Bio.Phylo import _io 958 handle = StringIO() 959 _io.write([self], handle, format_spec) 960 return handle.getvalue() 961 else: 962 # Follow python convention and default to using __str__ 963 return str(self)
964
965 - def format(self, format):
966 """Serialize the tree as a string in the specified file format. 967 968 This duplicates the __format__ magic method for pre-2.6 Pythons. 969 """ 970 return self.__format__(format)
971 972 # Pretty-printer for the entire tree hierarchy 973
974 - def __str__(self):
975 """String representation of the entire tree. 976 977 Serializes each sub-clade recursively using ``repr`` to create a summary 978 of the object structure. 979 """ 980 TAB = ' ' 981 textlines = [] 982 983 def print_tree(obj, indent): 984 """Recursively serialize sub-elements. 985 986 This closes over textlines and modifies it in-place. 987 """ 988 if isinstance(obj, (Tree, Clade)): 989 # Avoid infinite recursion or special formatting from str() 990 objstr = repr(obj) 991 else: 992 objstr = as_string(obj) 993 textlines.append(TAB * indent + objstr) 994 indent += 1 995 for attr in obj.__dict__: 996 child = getattr(obj, attr) 997 if isinstance(child, TreeElement): 998 print_tree(child, indent) 999 elif isinstance(child, list): 1000 for elem in child: 1001 if isinstance(elem, TreeElement): 1002 print_tree(elem, indent)
1003 1004 print_tree(self, 0) 1005 return '\n'.join(textlines)
1006
1007 1008 -class Clade(TreeElement, TreeMixin):
1009 """A recursively defined sub-tree. 1010 1011 :Parameters: 1012 branch_length : str 1013 The length of the branch leading to the root node of this clade. 1014 name : str 1015 The clade's name (a label). 1016 clades : list 1017 Sub-trees rooted directly under this tree's root. 1018 confidence : number 1019 Support. 1020 color : BranchColor 1021 The display color of the branch and descendents. 1022 width : number 1023 The display width of the branch and descendents. 1024 1025 """ 1026
1027 - def __init__(self, branch_length=None, name=None, clades=None, 1028 confidence=None, color=None, width=None):
1029 """Define parameters for the Clade tree.""" 1030 self.branch_length = branch_length 1031 self.name = name 1032 self.clades = clades or [] 1033 self.confidence = confidence 1034 self.color = color 1035 self.width = width
1036 1037 @property
1038 - def root(self):
1039 """Allow TreeMixin methods to traverse clades properly.""" 1040 return self
1041
1042 - def is_terminal(self):
1043 """True if this is a terminal (leaf) node.""" 1044 return (not self.clades)
1045 1046 # Sequence-type behavior methods 1047
1048 - def __getitem__(self, index):
1049 """Get clades by index (integer or slice).""" 1050 if isinstance(index, (int, slice)): 1051 return self.clades[index] 1052 ref = self 1053 for idx in index: 1054 ref = ref[idx] 1055 return ref
1056
1057 - def __iter__(self):
1058 """Iterate through this tree's direct descendent clades (sub-trees).""" 1059 return iter(self.clades)
1060
1061 - def __len__(self):
1062 """Number of clades directy under the root.""" 1063 return len(self.clades)
1064 1065 # Python 3:
1066 - def __bool__(self):
1067 """Boolean value of an instance of this class (True). 1068 1069 NB: If this method is not defined, but ``__len__`` is, then the object 1070 is considered true if the result of ``__len__()`` is nonzero. We want 1071 Clade instances to always be considered True. 1072 """ 1073 return True
1074 # Python 2: 1075 __nonzero__ = __bool__ 1076
1077 - def __str__(self):
1078 """Return name of the class instance.""" 1079 if self.name: 1080 return _utils.trim_str(self.name, 40, '...') 1081 return self.__class__.__name__
1082 1083 # Syntax sugar for setting the branch color
1084 - def _get_color(self):
1085 return self._color
1086
1087 - def _set_color(self, arg):
1088 if arg is None or isinstance(arg, BranchColor): 1089 self._color = arg 1090 elif isinstance(arg, basestring): 1091 if arg in BranchColor.color_names: 1092 # Known color name 1093 self._color = BranchColor.from_name(arg) 1094 elif arg.startswith('#') and len(arg) == 7: 1095 # HTML-style hex string 1096 self._color = BranchColor.from_hex(arg) 1097 else: 1098 raise ValueError("invalid color string %s" % arg) 1099 elif hasattr(arg, '__iter__') and len(arg) == 3: 1100 # RGB triplet 1101 self._color = BranchColor(*arg) 1102 else: 1103 raise ValueError("invalid color value %s" % arg)
1104 1105 color = property(_get_color, _set_color, doc="Branch color.")
1106
1107 1108 -class BranchColor(object):
1109 """Indicates the color of a clade when rendered graphically. 1110 1111 The color should be interpreted by client code (e.g. visualization 1112 programs) as applying to the whole clade, unless overwritten by the 1113 color(s) of sub-clades. 1114 1115 Color values must be integers from 0 to 255. 1116 """ 1117 1118 color_names = { 1119 'red': (255, 0, 0), 1120 'r': (255, 0, 0), 1121 'yellow': (255, 255, 0), 1122 'y': (255, 255, 0), 1123 'green': (0, 128, 0), 1124 'g': (0, 128, 0), 1125 'cyan': (0, 255, 255), 1126 'c': (0, 255, 255), 1127 'blue': (0, 0, 255), 1128 'b': (0, 0, 255), 1129 'magenta': (255, 0, 255), 1130 'm': (255, 0, 255), 1131 'black': (0, 0, 0), 1132 'k': (0, 0, 0), 1133 'white': (255, 255, 255), 1134 'w': (255, 255, 255), 1135 # Names standardized in HTML/CSS spec 1136 # http://w3schools.com/html/html_colornames.asp 1137 'maroon': (128, 0, 0), 1138 'olive': (128, 128, 0), 1139 'lime': (0, 255, 0), 1140 'aqua': (0, 255, 255), 1141 'teal': (0, 128, 128), 1142 'navy': (0, 0, 128), 1143 'fuchsia': (255, 0, 255), 1144 'purple': (128, 0, 128), 1145 'silver': (192, 192, 192), 1146 'gray': (128, 128, 128), 1147 # More definitions from matplotlib/gcolor2 1148 'grey': (128, 128, 128), 1149 'pink': (255, 192, 203), 1150 'salmon': (250, 128, 114), 1151 'orange': (255, 165, 0), 1152 'gold': (255, 215, 0), 1153 'tan': (210, 180, 140), 1154 'brown': (165, 42, 42), 1155 } 1156
1157 - def __init__(self, red, green, blue):
1158 """Initialize BranchColor for a tree.""" 1159 for color in (red, green, blue): 1160 assert (isinstance(color, int) and 1161 0 <= color <= 255 1162 ), "Color values must be integers between 0 and 255." 1163 self.red = red 1164 self.green = green 1165 self.blue = blue
1166 1167 @classmethod
1168 - def from_hex(cls, hexstr):
1169 """Construct a BranchColor object from a hexadecimal string. 1170 1171 The string format is the same style used in HTML and CSS, such as 1172 '#FF8000' for an RGB value of (255, 128, 0). 1173 """ 1174 assert (isinstance(hexstr, basestring) and 1175 hexstr.startswith('#') and 1176 len(hexstr) == 7 1177 ), "need a 24-bit hexadecimal string, e.g. #000000" 1178 1179 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:] 1180 return cls(*[int('0x' + cc, base=16) for cc in RGB])
1181 1182 @classmethod
1183 - def from_name(cls, colorname):
1184 """Construct a BranchColor object by the color's name.""" 1185 return cls(*cls.color_names[colorname])
1186
1187 - def to_hex(self):
1188 """Return a 24-bit hexadecimal RGB representation of this color. 1189 1190 The returned string is suitable for use in HTML/CSS, as a color 1191 parameter in matplotlib, and perhaps other situations. 1192 1193 Example: 1194 1195 >>> bc = BranchColor(12, 200, 100) 1196 >>> bc.to_hex() 1197 '#0cc864' 1198 1199 """ 1200 return "#%02x%02x%02x" % (self.red, self.green, self.blue)
1201
1202 - def to_rgb(self):
1203 """Return a tuple of RGB values (0 to 255) representing this color. 1204 1205 Example: 1206 1207 >>> bc = BranchColor(255, 165, 0) 1208 >>> bc.to_rgb() 1209 (255, 165, 0) 1210 1211 """ 1212 return (self.red, self.green, self.blue)
1213
1214 - def __repr__(self):
1215 """Preserve the standard RGB order when representing this object.""" 1216 return ('%s(red=%d, green=%d, blue=%d)' 1217 % (self.__class__.__name__, self.red, self.green, self.blue))
1218
1219 - def __str__(self):
1220 """Show the color's RGB values.""" 1221 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
1222