Package Bio :: Package Phylo :: Module BaseTree
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.BaseTree

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Base classes for Bio.Phylo objects. 
   7   
   8  All object representations for phylogenetic trees should derive from these base 
   9  classes in order to use the common methods defined on them. 
  10  """ 
  11  __docformat__ = "restructuredtext en" 
  12   
  13  from Bio._py3k import basestring, filter, unicode, zip 
  14   
  15  import collections 
  16  import copy 
  17  import itertools 
  18  import random 
  19  import re 
  20   
  21  from Bio import _utils 
  22   
  23   
  24  # NB: On Python 2, repr() and str() are specified to return byte strings, not 
  25  # unicode. On Python 3, it's the opposite. Horrible. 
  26  import sys 
  27  if sys.version_info[0] < 3: 
28 - def as_string(s):
29 if isinstance(s, unicode): 30 return s.encode('utf-8') 31 return str(s)
32 else: 33 as_string = str
34 35 36 # General tree-traversal algorithms 37 38 -def _level_traverse(root, get_children):
39 """Traverse a tree in breadth-first (level) order.""" 40 Q = collections.deque([root]) 41 while Q: 42 v = Q.popleft() 43 yield v 44 Q.extend(get_children(v))
45
46 47 -def _preorder_traverse(root, get_children):
48 """Traverse a tree in depth-first pre-order (parent before children).""" 49 def dfs(elem): 50 yield elem 51 for v in get_children(elem): 52 for u in dfs(v): 53 yield u
54 for elem in dfs(root): 55 yield elem 56
57 58 -def _postorder_traverse(root, get_children):
59 """Traverse a tree in depth-first post-order (children before parent).""" 60 def dfs(elem): 61 for v in get_children(elem): 62 for u in dfs(v): 63 yield u 64 yield elem
65 for elem in dfs(root): 66 yield elem 67
68 69 -def _sorted_attrs(elem):
70 """Get a flat list of elem's attributes, sorted for consistency.""" 71 singles = [] 72 lists = [] 73 # Sort attributes for consistent results 74 for attrname, child in sorted(elem.__dict__.items(), 75 key=lambda kv: kv[0]): 76 if child is None: 77 continue 78 if isinstance(child, list): 79 lists.extend(child) 80 else: 81 singles.append(child) 82 return (x for x in singles + lists 83 if isinstance(x, TreeElement))
84
85 86 # Factory functions to generalize searching for clades/nodes 87 88 -def _identity_matcher(target):
89 """Match a node to the target object by identity.""" 90 def match(node): 91 return (node is target)
92 return match 93
94 95 -def _class_matcher(target_cls):
96 """Match a node if it's an instance of the given class.""" 97 def match(node): 98 return isinstance(node, target_cls)
99 return match 100
101 102 -def _string_matcher(target):
103 def match(node): 104 if isinstance(node, (Clade, Tree)): 105 # Avoid triggering specialized or recursive magic methods 106 return node.name == target 107 return as_string(node) == target
108 return match 109
110 111 -def _attribute_matcher(kwargs):
112 """Match a node by specified attribute values. 113 114 ``terminal`` is a special case: True restricts the search to external (leaf) 115 nodes, False restricts to internal nodes, and None allows all tree elements 116 to be searched, including phyloXML annotations. 117 118 Otherwise, for a tree element to match the specification (i.e. for the 119 function produced by `_attribute_matcher` to return True when given a tree 120 element), it must have each of the attributes specified by the keys and 121 match each of the corresponding values -- think 'and', not 'or', for 122 multiple keys. 123 """ 124 def match(node): 125 if 'terminal' in kwargs: 126 # Special case: restrict to internal/external/any nodes 127 kwa_copy = kwargs.copy() 128 pattern = kwa_copy.pop('terminal') 129 if (pattern is not None and 130 (not hasattr(node, 'is_terminal') or 131 node.is_terminal() != pattern)): 132 return False 133 else: 134 kwa_copy = kwargs 135 for key, pattern in kwa_copy.items(): 136 # Nodes must match all other specified attributes 137 if not hasattr(node, key): 138 return False 139 target = getattr(node, key) 140 if isinstance(pattern, basestring): 141 return (isinstance(target, basestring) and 142 re.match(pattern + '$', target)) 143 if isinstance(pattern, bool): 144 return (pattern == bool(target)) 145 if isinstance(pattern, int): 146 return (pattern == target) 147 if pattern is None: 148 return (target is None) 149 raise TypeError('invalid query type: %s' % type(pattern)) 150 return True
151 return match 152
153 154 -def _function_matcher(matcher_func):
155 """Safer attribute lookup -- returns False instead of raising an error.""" 156 def match(node): 157 try: 158 return matcher_func(node) 159 except (LookupError, AttributeError, ValueError, TypeError): 160 return False
161 return match 162
163 164 -def _object_matcher(obj):
165 """Retrieve a matcher function by passing an arbitrary object. 166 167 i.e. passing a `TreeElement` such as a `Clade` or `Tree` instance returns an 168 identity matcher, passing a type such as the `PhyloXML.Taxonomy` class 169 returns a class matcher, and passing a dictionary returns an attribute 170 matcher. 171 172 The resulting 'match' function returns True when given an object matching 173 the specification (identity, type or attribute values), otherwise False. 174 This is useful for writing functions that search the tree, and probably 175 shouldn't be used directly by the end user. 176 """ 177 if isinstance(obj, TreeElement): 178 return _identity_matcher(obj) 179 if isinstance(obj, type): 180 return _class_matcher(obj) 181 if isinstance(obj, basestring): 182 return _string_matcher(obj) 183 if isinstance(obj, dict): 184 return _attribute_matcher(obj) 185 if callable(obj): 186 return _function_matcher(obj) 187 raise ValueError("%s (type %s) is not a valid type for comparison." 188 % (obj, type(obj)))
189
190 191 -def _combine_matchers(target, kwargs, require_spec):
192 """Merge target specifications with keyword arguments. 193 194 Dispatch the components to the various matcher functions, then merge into a 195 single boolean function. 196 """ 197 if not target: 198 if not kwargs: 199 if require_spec: 200 raise ValueError("you must specify a target object or keyword " 201 "arguments.") 202 return lambda x: True 203 return _attribute_matcher(kwargs) 204 match_obj = _object_matcher(target) 205 if not kwargs: 206 return match_obj 207 match_kwargs = _attribute_matcher(kwargs) 208 return (lambda x: match_obj(x) and match_kwargs(x))
209
210 211 -def _combine_args(first, *rest):
212 """Convert ``[targets]`` or ``*targets`` arguments to a single iterable. 213 214 This helps other functions work like the built-in functions `max` and 215 `min`. 216 """ 217 # Background: is_monophyletic takes a single list or iterable (like the 218 # same method in Bio.Nexus.Trees); root_with_outgroup and common_ancestor 219 # take separate arguments. This mismatch was in the initial release and I 220 # didn't notice the inconsistency until after Biopython 1.55. I can think 221 # of cases where either style is more convenient, so let's support both 222 # (for backward compatibility and consistency between methods). 223 if hasattr(first, '__iter__') and not (isinstance(first, TreeElement) or 224 isinstance(first, type) or 225 isinstance(first, basestring) or 226 isinstance(first, dict)): 227 # `terminals` is an iterable of targets 228 if rest: 229 raise ValueError("Arguments must be either a single list of " 230 "targets, or separately specified targets " 231 "(e.g. foo(t1, t2, t3)), but not both.") 232 return first 233 # `terminals` is a single target -- wrap in a container 234 return itertools.chain([first], rest)
235
236 237 # Class definitions 238 239 -class TreeElement(object):
240 """Base class for all Bio.Phylo classes.""" 241
242 - def __repr__(self):
243 """Show this object's constructor with its primitive arguments.""" 244 def pair_as_kwarg_string(key, val): 245 if isinstance(val, basestring): 246 return ("%s='%s'" 247 % (key, _utils.trim_str(as_string(val), 60, '...'))) 248 return "%s=%s" % (key, val)
249 return ('%s(%s)' 250 % (self.__class__.__name__, 251 ', '.join(pair_as_kwarg_string(key, val) 252 for key, val in sorted(self.__dict__.items()) 253 if val is not None and 254 type(val) in (str, int, float, bool, unicode))))
255 256 __str__ = __repr__ 257
258 259 -class TreeMixin(object):
260 """Methods for Tree- and Clade-based classes. 261 262 This lets `Tree` and `Clade` support the same traversal and searching 263 operations without requiring Clade to inherit from Tree, so Clade isn't 264 required to have all of Tree's attributes -- just ``root`` (a Clade 265 instance) and ``is_terminal``. 266 """ 267 # Traversal methods 268
269 - def _filter_search(self, filter_func, order, follow_attrs):
270 """Perform a BFS or DFS traversal through all elements in this tree. 271 272 :returns: generator of all elements for which `filter_func` is True. 273 """ 274 order_opts = {'preorder': _preorder_traverse, 275 'postorder': _postorder_traverse, 276 'level': _level_traverse} 277 try: 278 order_func = order_opts[order] 279 except KeyError: 280 raise ValueError("Invalid order '%s'; must be one of: %s" 281 % (order, tuple(order_opts))) 282 if follow_attrs: 283 get_children = _sorted_attrs 284 root = self 285 else: 286 get_children = lambda elem: elem.clades 287 root = self.root 288 return filter(filter_func, order_func(root, get_children))
289
290 - def find_any(self, *args, **kwargs):
291 """Return the first element found by find_elements(), or None. 292 293 This is also useful for checking whether any matching element exists in 294 the tree, and can be used in a conditional expression. 295 """ 296 hits = self.find_elements(*args, **kwargs) 297 try: 298 return next(hits) 299 except StopIteration: 300 return None
301
302 - def find_elements(self, target=None, terminal=None, order='preorder', 303 **kwargs):
304 """Find all tree elements matching the given attributes. 305 306 The arbitrary keyword arguments indicate the attribute name of the 307 sub-element and the value to match: string, integer or boolean. Strings 308 are evaluated as regular expression matches; integers are compared 309 directly for equality, and booleans evaluate the attribute's truth value 310 (True or False) before comparing. To handle nonzero floats, search with 311 a boolean argument, then filter the result manually. 312 313 If no keyword arguments are given, then just the class type is used for 314 matching. 315 316 The result is an iterable through all matching objects, by depth-first 317 search. (Not necessarily the same order as the elements appear in the 318 source file!) 319 320 :Parameters: 321 target : TreeElement instance, type, dict, or callable 322 Specifies the characteristics to search for. (The default, 323 TreeElement, matches any standard Bio.Phylo type.) 324 terminal : bool 325 A boolean value to select for or against terminal nodes (a.k.a. 326 leaf nodes). True searches for only terminal nodes, False 327 excludes terminal nodes, and the default, None, searches both 328 terminal and non-terminal nodes, as well as any tree elements 329 lacking the ``is_terminal`` method. 330 order : {'preorder', 'postorder', 'level'} 331 Tree traversal order: 'preorder' (default) is depth-first 332 search, 'postorder' is DFS with child nodes preceding parents, 333 and 'level' is breadth-first search. 334 335 Example 336 ------- 337 338 >>> from Bio.Phylo.IO import PhyloXMIO 339 >>> phx = PhyloXMLIO.read('phyloxml_examples.xml') 340 >>> matches = phx.phylogenies[5].find_elements(code='OCTVU') 341 >>> next(matches) 342 Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris') 343 344 """ 345 if terminal is not None: 346 kwargs['terminal'] = terminal 347 is_matching_elem = _combine_matchers(target, kwargs, False) 348 return self._filter_search(is_matching_elem, order, True)
349
350 - def find_clades(self, target=None, terminal=None, order='preorder', 351 **kwargs):
352 """Find each clade containing a matching element. 353 354 That is, find each element as with find_elements(), but return the 355 corresponding clade object. (This is usually what you want.) 356 357 :returns: an iterable through all matching objects, searching 358 depth-first (preorder) by default. 359 """ 360 def match_attrs(elem): 361 orig_clades = elem.__dict__.pop('clades') 362 found = elem.find_any(target, **kwargs) 363 elem.clades = orig_clades 364 return (found is not None)
365 if terminal is None: 366 is_matching_elem = match_attrs 367 else: 368 def is_matching_elem(elem): 369 return ((elem.is_terminal() == terminal) and 370 match_attrs(elem))
371 return self._filter_search(is_matching_elem, order, False) 372
373 - def get_path(self, target=None, **kwargs):
374 """List the clades directly between this root and the given target. 375 376 :returns: list of all clade objects along this path, ending with the 377 given target, but excluding the root clade. 378 """ 379 # Only one path will work -- ignore weights and visits 380 path = [] 381 match = _combine_matchers(target, kwargs, True) 382 383 def check_in_path(v): 384 if match(v): 385 path.append(v) 386 return True 387 elif v.is_terminal(): 388 return False 389 for child in v: 390 if check_in_path(child): 391 path.append(v) 392 return True 393 return False
394 395 if not check_in_path(self.root): 396 return None 397 return path[-2::-1] 398
399 - def get_nonterminals(self, order='preorder'):
400 """Get a list of all of this tree's nonterminal (internal) nodes.""" 401 return list(self.find_clades(terminal=False, order=order))
402
403 - def get_terminals(self, order='preorder'):
404 """Get a list of all of this tree's terminal (leaf) nodes.""" 405 return list(self.find_clades(terminal=True, order=order))
406
407 - def trace(self, start, finish):
408 """List of all clade object between two targets in this tree. 409 410 Excluding `start`, including `finish`. 411 """ 412 mrca = self.common_ancestor(start, finish) 413 fromstart = mrca.get_path(start)[-2::-1] 414 to = mrca.get_path(finish) 415 return fromstart + [mrca] + to
416 417 # Information methods 418
419 - def common_ancestor(self, targets, *more_targets):
420 """Most recent common ancestor (clade) of all the given targets. 421 422 Edge cases: 423 - If no target is given, returns self.root 424 - If 1 target is given, returns the target 425 - If any target is not found in this tree, raises a ValueError 426 """ 427 paths = [self.get_path(t) 428 for t in _combine_args(targets, *more_targets)] 429 # Validation -- otherwise izip throws a spooky error below 430 for p, t in zip(paths, targets): 431 if p is None: 432 raise ValueError("target %s is not in this tree" % repr(t)) 433 mrca = self.root 434 for level in zip(*paths): 435 ref = level[0] 436 for other in level[1:]: 437 if ref is not other: 438 break 439 else: 440 mrca = ref 441 if ref is not mrca: 442 break 443 return mrca
444
445 - def count_terminals(self):
446 """Counts the number of terminal (leaf) nodes within this tree.""" 447 return _utils.iterlen(self.find_clades(terminal=True))
448
449 - def depths(self, unit_branch_lengths=False):
450 """Create a mapping of tree clades to depths (by branch length). 451 452 :Parameters: 453 unit_branch_lengths : bool 454 If True, count only the number of branches (levels in the tree). 455 By default the distance is the cumulative branch length leading 456 to the clade. 457 458 :returns: dict of {clade: depth}, where keys are all of the Clade 459 instances in the tree, and values are the distance from the root to 460 each clade (including terminals). 461 """ 462 if unit_branch_lengths: 463 depth_of = lambda c: 1 464 else: 465 depth_of = lambda c: c.branch_length or 0 466 depths = {} 467 468 def update_depths(node, curr_depth): 469 depths[node] = curr_depth 470 for child in node.clades: 471 new_depth = curr_depth + depth_of(child) 472 update_depths(child, new_depth)
473 474 update_depths(self.root, self.root.branch_length or 0) 475 return depths 476
477 - def distance(self, target1, target2=None):
478 """Calculate the sum of the branch lengths between two targets. 479 480 If only one target is specified, the other is the root of this tree. 481 """ 482 if target2 is None: 483 return sum(n.branch_length for n in self.get_path(target1) 484 if n.branch_length is not None) 485 mrca = self.common_ancestor(target1, target2) 486 return mrca.distance(target1) + mrca.distance(target2)
487
488 - def is_bifurcating(self):
489 """Return True if tree downstream of node is strictly bifurcating. 490 491 I.e., all nodes have either 2 or 0 children (internal or external, 492 respectively). The root may have 3 descendents and still be considered 493 part of a bifurcating tree, because it has no ancestor. 494 """ 495 # Root can be trifurcating 496 if isinstance(self, Tree) and len(self.root) == 3: 497 return (self.root.clades[0].is_bifurcating() and 498 self.root.clades[1].is_bifurcating() and 499 self.root.clades[2].is_bifurcating()) 500 if len(self.root) == 2: 501 return (self.root.clades[0].is_bifurcating() and 502 self.root.clades[1].is_bifurcating()) 503 if len(self.root) == 0: 504 return True 505 return False
506
507 - def is_monophyletic(self, terminals, *more_terminals):
508 """MRCA of terminals if they comprise a complete subclade, or False. 509 510 I.e., there exists a clade such that its terminals are the same set as 511 the given targets. 512 513 The given targets must be terminals of the tree. 514 515 To match both `Bio.Nexus.Trees` and the other multi-target methods in 516 Bio.Phylo, arguments to this method can be specified either of two ways: 517 (i) as a single list of targets, or (ii) separately specified targets, 518 e.g. is_monophyletic(t1, t2, t3) -- but not both. 519 520 For convenience, this method returns the common ancestor (MCRA) of the 521 targets if they are monophyletic (instead of the value True), and False 522 otherwise. 523 524 :returns: common ancestor if terminals are monophyletic, otherwise False. 525 """ 526 target_set = set(_combine_args(terminals, *more_terminals)) 527 current = self.root 528 while True: 529 if set(current.get_terminals()) == target_set: 530 return current 531 # Try a narrower subclade 532 for subclade in current.clades: 533 if set(subclade.get_terminals()).issuperset(target_set): 534 current = subclade 535 break 536 else: 537 return False
538
539 - def is_parent_of(self, target=None, **kwargs):
540 """True if target is a descendent of this tree. 541 542 Not required to be a direct descendent. 543 544 To check only direct descendents of a clade, simply use list membership 545 testing: ``if subclade in clade: ...`` 546 """ 547 return self.get_path(target, **kwargs) is not None
548
549 - def is_preterminal(self):
550 """True if all direct descendents are terminal.""" 551 if self.root.is_terminal(): 552 return False 553 for clade in self.root.clades: 554 if not clade.is_terminal(): 555 return False 556 return True
557
558 - def total_branch_length(self):
559 """Calculate the sum of all the branch lengths in this tree.""" 560 return sum(node.branch_length 561 for node in self.find_clades(branch_length=True))
562 563 # Tree manipulation methods 564
565 - def collapse(self, target=None, **kwargs):
566 """Deletes target from the tree, relinking its children to its parent. 567 568 :returns: the parent clade. 569 """ 570 path = self.get_path(target, **kwargs) 571 if not path: 572 raise ValueError("couldn't collapse %s in this tree" 573 % (target or kwargs)) 574 if len(path) == 1: 575 parent = self.root 576 else: 577 parent = path[-2] 578 popped = parent.clades.pop(parent.clades.index(path[-1])) 579 extra_length = popped.branch_length or 0 580 for child in popped: 581 child.branch_length += extra_length 582 parent.clades.extend(popped.clades) 583 return parent
584
585 - def collapse_all(self, target=None, **kwargs):
586 """Collapse all the descendents of this tree, leaving only terminals. 587 588 Total branch lengths are preserved, i.e. the distance to each terminal 589 stays the same. 590 591 For example, this will safely collapse nodes with poor bootstrap 592 support: 593 594 >>> tree.collapse_all(lambda c: c.confidence is not None and 595 ... c.confidence < 70) 596 597 This implementation avoids strange side-effects by using level-order 598 traversal and testing all clade properties (versus the target 599 specification) up front. In particular, if a clade meets the target 600 specification in the original tree, it will be collapsed. For example, 601 if the condition is: 602 603 >>> tree.collapse_all(lambda c: c.branch_length < 0.1) 604 605 Collapsing a clade's parent node adds the parent's branch length to the 606 child, so during the execution of collapse_all, a clade's branch_length 607 may increase. In this implementation, clades are collapsed according to 608 their properties in the original tree, not the properties when tree 609 traversal reaches the clade. (It's easier to debug.) If you want the 610 other behavior (incremental testing), modifying the source code of this 611 function is straightforward. 612 """ 613 # Read the iterable into a list to protect against in-place changes 614 matches = list(self.find_clades(target, False, 'level', **kwargs)) 615 if not matches: 616 # No matching nodes to collapse 617 return 618 # Skip the root node -- it can't be collapsed 619 if matches[0] == self.root: 620 matches.pop(0) 621 for clade in matches: 622 self.collapse(clade)
623
624 - def ladderize(self, reverse=False):
625 """Sort clades in-place according to the number of terminal nodes. 626 627 Deepest clades are last by default. Use ``reverse=True`` to sort clades 628 deepest-to-shallowest. 629 """ 630 self.root.clades.sort(key=lambda c: c.count_terminals(), 631 reverse=reverse) 632 for subclade in self.root.clades: 633 subclade.ladderize(reverse=reverse)
634
635 - def prune(self, target=None, **kwargs):
636 """Prunes a terminal clade from the tree. 637 638 If taxon is from a bifurcation, the connecting node will be collapsed 639 and its branch length added to remaining terminal node. This might be no 640 longer be a meaningful value. 641 642 :returns: parent clade of the pruned target 643 """ 644 if 'terminal' in kwargs and kwargs['terminal']: 645 raise ValueError("target must be terminal") 646 path = self.get_path(target, terminal=True, **kwargs) 647 if not path: 648 raise ValueError("can't find a matching target below this root") 649 if len(path) == 1: 650 parent = self.root 651 else: 652 parent = path[-2] 653 parent.clades.remove(path[-1]) 654 if len(parent) == 1: 655 # We deleted a branch from a bifurcation 656 if parent == self.root: 657 # If we're at the root, move the root upwards 658 # NB: This loses the length of the original branch 659 newroot = parent.clades[0] 660 newroot.branch_length = None 661 parent = self.root = newroot 662 else: 663 # If we're not at the root, collapse this parent 664 child = parent.clades[0] 665 if child.branch_length is not None: 666 child.branch_length += (parent.branch_length or 0.0) 667 if len(path) < 3: 668 grandparent = self.root 669 else: 670 grandparent = path[-3] 671 # Replace parent with child at the same place in grandparent 672 index = grandparent.clades.index(parent) 673 grandparent.clades.pop(index) 674 grandparent.clades.insert(index, child) 675 parent = grandparent 676 return parent
677
678 - def split(self, n=2, branch_length=1.0):
679 """Generate n (default 2) new descendants. 680 681 In a species tree, this is a speciation event. 682 683 New clades have the given branch_length and the same name as this 684 clade's root plus an integer suffix (counting from 0). For example, 685 splitting a clade named "A" produces sub-clades named "A0" and "A1". 686 If the clade has no name, the prefix "n" is used for child nodes, e.g. 687 "n0" and "n1". 688 """ 689 clade_cls = type(self.root) 690 base_name = self.root.name or 'n' 691 for i in range(n): 692 clade = clade_cls(name=base_name + str(i), 693 branch_length=branch_length) 694 self.root.clades.append(clade)
695
696 697 -class Tree(TreeElement, TreeMixin):
698 """A phylogenetic tree, containing global info for the phylogeny. 699 700 The structure and node-specific data is accessible through the 'root' 701 clade attached to the Tree instance. 702 703 :Parameters: 704 root : Clade 705 The starting node of the tree. If the tree is rooted, this will 706 usually be the root node. 707 rooted : bool 708 Whether or not the tree is rooted. By default, a tree is assumed to 709 be rooted. 710 id : str 711 The identifier of the tree, if there is one. 712 name : str 713 The name of the tree, in essence a label. 714 """ 715
716 - def __init__(self, root=None, rooted=True, id=None, name=None):
717 self.root = root or Clade() 718 self.rooted = rooted 719 self.id = id 720 self.name = name
721 722 @classmethod
723 - def from_clade(cls, clade, **kwargs):
724 """Create a new Tree object given a clade. 725 726 Keyword arguments are the usual `Tree` constructor parameters. 727 """ 728 root = copy.deepcopy(clade) 729 return cls(root, **kwargs)
730 731 @classmethod
732 - def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
733 """Create a randomized bifurcating tree given a list of taxa. 734 735 :param taxa: Either an integer specifying the number of taxa to create 736 (automatically named taxon#), or an iterable of taxon names, as 737 strings. 738 739 :returns: a tree of the same type as this class. 740 """ 741 if isinstance(taxa, int): 742 taxa = ['taxon%s' % (i + 1) for i in range(taxa)] 743 elif hasattr(taxa, '__iter__'): 744 taxa = list(taxa) 745 else: 746 raise TypeError("taxa argument must be integer (# taxa) or " 747 "iterable of taxon names.") 748 rtree = cls() 749 terminals = [rtree.root] 750 while len(terminals) < len(taxa): 751 newsplit = random.choice(terminals) 752 newsplit.split(branch_length=branch_length) 753 newterms = newsplit.clades 754 if branch_stdev: 755 # Add some noise to the branch lengths 756 for nt in newterms: 757 nt.branch_length = max(0, 758 random.gauss(branch_length, branch_stdev)) 759 terminals.remove(newsplit) 760 terminals.extend(newterms) 761 # Distribute taxon labels randomly 762 random.shuffle(taxa) 763 for node, name in zip(terminals, taxa): 764 node.name = name 765 return rtree
766 767 @property
768 - def clade(self):
769 """The first clade in this tree (not itself).""" 770 return self.root
771
772 - def as_phyloxml(self, **kwargs):
773 """Convert this tree to a PhyloXML-compatible Phylogeny. 774 775 This lets you use the additional annotation types PhyloXML defines, and 776 save this information when you write this tree as 'phyloxml'. 777 """ 778 from Bio.Phylo.PhyloXML import Phylogeny 779 return Phylogeny.from_tree(self, **kwargs)
780 781 # XXX Py3 Compatibility: In Python 3.0+, **kwargs can be replaced with the 782 # named keyword argument outgroup_branch_length=None
783 - def root_with_outgroup(self, outgroup_targets, *more_targets, **kwargs):
784 """Reroot this tree with the outgroup clade containing outgroup_targets. 785 786 Operates in-place. 787 788 Edge cases: 789 790 - If ``outgroup == self.root``, no change 791 - If outgroup is terminal, create new bifurcating root node with a 792 0-length branch to the outgroup 793 - If outgroup is internal, use the given outgroup node as the new 794 trifurcating root, keeping branches the same 795 - If the original root was bifurcating, drop it from the tree, 796 preserving total branch lengths 797 798 :param outgroup_branch_length: length of the branch leading to the 799 outgroup after rerooting. If not specified (None), then: 800 801 - If the outgroup is an internal node (not a single terminal taxon), 802 then use that node as the new root. 803 - Otherwise, create a new root node as the parent of the outgroup. 804 805 """ 806 # This raises a ValueError if any target is not in this tree 807 # Otherwise, common_ancestor guarantees outgroup is in this tree 808 outgroup = self.common_ancestor(outgroup_targets, *more_targets) 809 outgroup_path = self.get_path(outgroup) 810 if len(outgroup_path) == 0: 811 # Outgroup is the current root -- no change 812 return 813 814 prev_blen = outgroup.branch_length or 0.0 815 # Hideous kludge because Py2.x doesn't allow keyword args after *args 816 outgroup_branch_length = kwargs.get('outgroup_branch_length') 817 if outgroup_branch_length is not None: 818 assert 0 <= outgroup_branch_length <= prev_blen, \ 819 "outgroup_branch_length must be between 0 and the " \ 820 "original length of the branch leading to the outgroup." 821 822 if outgroup.is_terminal() or outgroup_branch_length is not None: 823 # Create a new root with a 0-length branch to the outgroup 824 outgroup.branch_length = outgroup_branch_length or 0.0 825 new_root = self.root.__class__( 826 branch_length=self.root.branch_length, clades=[outgroup]) 827 # The first branch reversal (see the upcoming loop) is modified 828 if len(outgroup_path) == 1: 829 # No nodes between the original root and outgroup to rearrange. 830 # Most of the code below will be skipped, but we still need 831 # 'new_parent' pointing at the new root. 832 new_parent = new_root 833 else: 834 parent = outgroup_path.pop(-2) 835 # First iteration of reversing the path to the outgroup 836 parent.clades.pop(parent.clades.index(outgroup)) 837 (prev_blen, parent.branch_length) = (parent.branch_length, 838 prev_blen - outgroup.branch_length) 839 new_root.clades.insert(0, parent) 840 new_parent = parent 841 else: 842 # Use the given outgroup node as the new (trifurcating) root 843 new_root = outgroup 844 new_root.branch_length = self.root.branch_length 845 new_parent = new_root 846 847 # Tracing the outgroup lineage backwards, reattach the subclades under a 848 # new root clade. Reverse the branches directly above the outgroup in 849 # the tree, but keep the descendants of those clades as they are. 850 for parent in outgroup_path[-2::-1]: 851 parent.clades.pop(parent.clades.index(new_parent)) 852 prev_blen, parent.branch_length = parent.branch_length, prev_blen 853 new_parent.clades.insert(0, parent) 854 new_parent = parent 855 856 # Finally, handle the original root according to number of descendents 857 old_root = self.root 858 if outgroup in old_root.clades: 859 assert len(outgroup_path) == 1 860 old_root.clades.pop(old_root.clades.index(outgroup)) 861 else: 862 old_root.clades.pop(old_root.clades.index(new_parent)) 863 if len(old_root) == 1: 864 # Delete the old bifurcating root & add branch lengths 865 ingroup = old_root.clades[0] 866 if ingroup.branch_length: 867 ingroup.branch_length += prev_blen 868 else: 869 ingroup.branch_length = prev_blen 870 new_parent.clades.insert(0, ingroup) 871 # ENH: If annotations are attached to old_root, do... something. 872 else: 873 # Keep the old trifurcating/polytomous root as an internal node 874 old_root.branch_length = prev_blen 875 new_parent.clades.insert(0, old_root) 876 877 self.root = new_root 878 self.rooted = True 879 return
880
881 - def root_at_midpoint(self):
882 """Root the tree at the midpoint of the two most distant taxa. 883 884 This operates in-place, leaving a bifurcating root. The topology of the 885 tree is otherwise retained, though no guarantees are made about the 886 stability of clade/node/taxon ordering. 887 """ 888 # Identify the largest pairwise distance 889 max_distance = 0.0 890 tips = self.get_terminals() 891 for tip in tips: 892 self.root_with_outgroup(tip) 893 new_max = max(self.depths().items(), key=lambda nd: nd[1]) 894 if new_max[1] > max_distance: 895 tip1 = tip 896 tip2 = new_max[0] 897 max_distance = new_max[1] 898 self.root_with_outgroup(tip1) 899 # Depth to go from the ingroup tip toward the outgroup tip 900 root_remainder = 0.5 * (max_distance - (self.root.branch_length or 0)) 901 assert root_remainder >= 0 902 # Identify the midpoint and reroot there. 903 # Trace the path to the outgroup tip until all of the root depth has 904 # been traveled/accounted for. 905 for node in self.get_path(tip2): 906 root_remainder -= node.branch_length 907 if root_remainder < 0: 908 outgroup_node = node 909 outgroup_branch_length = -root_remainder 910 break 911 else: 912 raise ValueError("Somehow, failed to find the midpoint!") 913 self.root_with_outgroup(outgroup_node, 914 outgroup_branch_length=outgroup_branch_length)
915 916 # Method assumed by TreeMixin 917
918 - def is_terminal(self):
919 """True if the root of this tree is terminal.""" 920 return (not self.root.clades)
921 922 # Convention from SeqRecord and Alignment classes 923
924 - def __format__(self, format_spec):
925 """Serialize the tree as a string in the specified file format. 926 927 This method supports the ``format`` built-in function added in Python 928 2.6/3.0. 929 930 :param format_spec: a lower-case string supported by `Bio.Phylo.write` 931 as an output file format. 932 """ 933 if format_spec: 934 from Bio._py3k import StringIO 935 from Bio.Phylo import _io 936 handle = StringIO() 937 _io.write([self], handle, format_spec) 938 return handle.getvalue() 939 else: 940 # Follow python convention and default to using __str__ 941 return str(self)
942
943 - def format(self, format):
944 """Serialize the tree as a string in the specified file format. 945 946 This duplicates the __format__ magic method for pre-2.6 Pythons. 947 """ 948 return self.__format__(format)
949 950 # Pretty-printer for the entire tree hierarchy 951
952 - def __str__(self):
953 """String representation of the entire tree. 954 955 Serializes each sub-clade recursively using ``repr`` to create a summary 956 of the object structure. 957 """ 958 TAB = ' ' 959 textlines = [] 960 961 def print_tree(obj, indent): 962 """Recursively serialize sub-elements. 963 964 This closes over textlines and modifies it in-place. 965 """ 966 if isinstance(obj, (Tree, Clade)): 967 # Avoid infinite recursion or special formatting from str() 968 objstr = repr(obj) 969 else: 970 objstr = as_string(obj) 971 textlines.append(TAB * indent + objstr) 972 indent += 1 973 for attr in obj.__dict__: 974 child = getattr(obj, attr) 975 if isinstance(child, TreeElement): 976 print_tree(child, indent) 977 elif isinstance(child, list): 978 for elem in child: 979 if isinstance(elem, TreeElement): 980 print_tree(elem, indent)
981 982 print_tree(self, 0) 983 return '\n'.join(textlines)
984
985 986 -class Clade(TreeElement, TreeMixin):
987 """A recursively defined sub-tree. 988 989 :Parameters: 990 branch_length : str 991 The length of the branch leading to the root node of this clade. 992 name : str 993 The clade's name (a label). 994 clades : list 995 Sub-trees rooted directly under this tree's root. 996 confidence : number 997 Support. 998 color : BranchColor 999 The display color of the branch and descendents. 1000 width : number 1001 The display width of the branch and descendents. 1002 """ 1003
1004 - def __init__(self, branch_length=None, name=None, clades=None, 1005 confidence=None, color=None, width=None):
1006 self.branch_length = branch_length 1007 self.name = name 1008 self.clades = clades or [] 1009 self.confidence = confidence 1010 self.color = color 1011 self.width = width
1012 1013 @property
1014 - def root(self):
1015 """Allow TreeMixin methods to traverse clades properly.""" 1016 return self
1017
1018 - def is_terminal(self):
1019 """True if this is a terminal (leaf) node.""" 1020 return (not self.clades)
1021 1022 # Sequence-type behavior methods 1023
1024 - def __getitem__(self, index):
1025 """Get clades by index (integer or slice).""" 1026 if isinstance(index, int) or isinstance(index, slice): 1027 return self.clades[index] 1028 ref = self 1029 for idx in index: 1030 ref = ref[idx] 1031 return ref
1032
1033 - def __iter__(self):
1034 """Iterate through this tree's direct descendent clades (sub-trees).""" 1035 return iter(self.clades)
1036
1037 - def __len__(self):
1038 """Number of clades directy under the root.""" 1039 return len(self.clades)
1040 1041 # Python 3:
1042 - def __bool__(self):
1043 """Boolean value of an instance of this class (True). 1044 1045 NB: If this method is not defined, but ``__len__`` is, then the object 1046 is considered true if the result of ``__len__()`` is nonzero. We want 1047 Clade instances to always be considered True. 1048 """ 1049 return True
1050 # Python 2: 1051 __nonzero__ = __bool__ 1052
1053 - def __str__(self):
1054 if self.name: 1055 return _utils.trim_str(self.name, 40, '...') 1056 return self.__class__.__name__
1057 1058 # Syntax sugar for setting the branch color
1059 - def _get_color(self):
1060 return self._color
1061
1062 - def _set_color(self, arg):
1063 if arg is None or isinstance(arg, BranchColor): 1064 self._color = arg 1065 elif isinstance(arg, basestring): 1066 if arg in BranchColor.color_names: 1067 # Known color name 1068 self._color = BranchColor.from_name(arg) 1069 elif arg.startswith('#') and len(arg) == 7: 1070 # HTML-style hex string 1071 self._color = BranchColor.from_hex(arg) 1072 else: 1073 raise ValueError("invalid color string %s" % arg) 1074 elif hasattr(arg, '__iter__') and len(arg) == 3: 1075 # RGB triplet 1076 self._color = BranchColor(*arg) 1077 else: 1078 raise ValueError("invalid color value %s" % arg)
1079 1080 color = property(_get_color, _set_color, doc="Branch color.")
1081
1082 1083 -class BranchColor(object):
1084 """Indicates the color of a clade when rendered graphically. 1085 1086 The color should be interpreted by client code (e.g. visualization 1087 programs) as applying to the whole clade, unless overwritten by the 1088 color(s) of sub-clades. 1089 1090 Color values must be integers from 0 to 255. 1091 """ 1092 1093 color_names = { 1094 'red': (255, 0, 0), 1095 'r': (255, 0, 0), 1096 'yellow': (255, 255, 0), 1097 'y': (255, 255, 0), 1098 'green': ( 0, 128, 0), 1099 'g': ( 0, 128, 0), 1100 'cyan': ( 0, 255, 255), 1101 'c': ( 0, 255, 255), 1102 'blue': ( 0, 0, 255), 1103 'b': ( 0, 0, 255), 1104 'magenta': (255, 0, 255), 1105 'm': (255, 0, 255), 1106 'black': ( 0, 0, 0), 1107 'k': ( 0, 0, 0), 1108 'white': (255, 255, 255), 1109 'w': (255, 255, 255), 1110 # Names standardized in HTML/CSS spec 1111 # http://w3schools.com/html/html_colornames.asp 1112 'maroon': (128, 0, 0), 1113 'olive': (128, 128, 0), 1114 'lime': ( 0, 255, 0), 1115 'aqua': ( 0, 255, 255), 1116 'teal': ( 0, 128, 128), 1117 'navy': ( 0, 0, 128), 1118 'fuchsia': (255, 0, 255), 1119 'purple': (128, 0, 128), 1120 'silver': (192, 192, 192), 1121 'gray': (128, 128, 128), 1122 # More definitions from matplotlib/gcolor2 1123 'grey': (128, 128, 128), 1124 'pink': (255, 192, 203), 1125 'salmon': (250, 128, 114), 1126 'orange': (255, 165, 0), 1127 'gold': (255, 215, 0), 1128 'tan': (210, 180, 140), 1129 'brown': (165, 42, 42), 1130 } 1131
1132 - def __init__(self, red, green, blue):
1133 for color in (red, green, blue): 1134 assert (isinstance(color, int) and 1135 0 <= color <= 255 1136 ), "Color values must be integers between 0 and 255." 1137 self.red = red 1138 self.green = green 1139 self.blue = blue
1140 1141 @classmethod
1142 - def from_hex(cls, hexstr):
1143 """Construct a BranchColor object from a hexadecimal string. 1144 1145 The string format is the same style used in HTML and CSS, such as 1146 '#FF8000' for an RGB value of (255, 128, 0). 1147 """ 1148 assert (isinstance(hexstr, basestring) and 1149 hexstr.startswith('#') and 1150 len(hexstr) == 7 1151 ), "need a 24-bit hexadecimal string, e.g. #000000" 1152 1153 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:] 1154 return cls(*[int('0x' + cc, base=16) for cc in RGB])
1155 1156 @classmethod
1157 - def from_name(cls, colorname):
1158 """Construct a BranchColor object by the color's name.""" 1159 return cls(*cls.color_names[colorname])
1160
1161 - def to_hex(self):
1162 """Return a 24-bit hexadecimal RGB representation of this color. 1163 1164 The returned string is suitable for use in HTML/CSS, as a color 1165 parameter in matplotlib, and perhaps other situations. 1166 1167 Example: 1168 1169 >>> bc = BranchColor(12, 200, 100) 1170 >>> bc.to_hex() 1171 '#0cc864' 1172 """ 1173 return "#%02x%02x%02x" % (self.red, self.green, self.blue)
1174
1175 - def to_rgb(self):
1176 """Return a tuple of RGB values (0 to 255) representing this color. 1177 1178 Example: 1179 1180 >>> bc = BranchColor(255, 165, 0) 1181 >>> bc.to_rgb() 1182 (255, 165, 0) 1183 """ 1184 return (self.red, self.green, self.blue)
1185
1186 - def __repr__(self):
1187 """Preserve the standard RGB order when representing this object.""" 1188 return ('%s(red=%d, green=%d, blue=%d)' 1189 % (self.__class__.__name__, self.red, self.green, self.blue))
1190
1191 - def __str__(self):
1192 """Show the color's RGB values.""" 1193 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
1194