Package Bio :: Package Phylo :: Module Consensus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.Consensus

  1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Classes and methods for finding consensus trees. 
  7   
  8  This module contains a ``_BitString`` class to assist the consensus tree 
  9  searching and some common consensus algorithms such as strict, majority rule and 
 10  adam consensus. 
 11  """ 
 12  from __future__ import division 
 13   
 14  import random 
 15  import itertools 
 16   
 17  from ast import literal_eval 
 18  from Bio.Phylo import BaseTree 
19 20 21 -class _BitString(str):
22 """Helper class for binary string data (PRIVATE). 23 24 Assistant class of binary string data used for storing and 25 counting compatible clades in consensus tree searching. It includes 26 some binary manipulation(&|^~) methods. 27 28 _BitString is a sub-class of ``str`` object that only accepts two 29 characters('0' and '1'), with additional functions for binary-like 30 manipulation(&|^~). It is used to count and store the clades in 31 multiple trees in consensus tree searching. During counting, the 32 clades will be considered the same if their terminals(in terms of 33 ``name`` attribute) are the same. 34 35 For example, let's say two trees are provided as below to search 36 their strict consensus tree:: 37 38 tree1: (((A, B), C),(D, E)) 39 tree2: ((A, (B, C)),(D, E)) 40 41 For both trees, a _BitString object '11111' will represent their 42 root clade. Each '1' stands for the terminal clade in the list 43 [A, B, C, D, E](the order might not be the same, it's determined 44 by the ``get_terminal`` method of the first tree provided). For 45 the clade ((A, B), C) in tree1 and (A, (B, C)) in tree2, they both 46 can be represented by '11100'. Similarly, '11000' represents clade 47 (A, B) in tree1, '01100' represents clade (B, C) in tree2, and '00011' 48 represents clade (D, E) in both trees. 49 50 So, with the ``_count_clades`` function in this module, finally we 51 can get the clade counts and their _BitString representation as follows 52 (the root and terminals are omitted):: 53 54 clade _BitString count 55 ABC '11100' 2 56 DE '00011' 2 57 AB '11000' 1 58 BC '01100' 1 59 60 To get the _BitString representation of a clade, we can use the following 61 code snippet:: 62 63 # suppose we are provided with a tree list, the first thing to do is 64 # to get all the terminal names in the first tree 65 term_names = [term.name for term in trees[0].get_terminals()] 66 # for a specific clade in any of the tree, also get its terminal names 67 clade_term_names = [term.name for term in clade.get_terminals()] 68 # then create a boolean list 69 boolvals = [name in clade_term_names for name in term_names] 70 # create the string version and pass it to _BitString 71 bitstr = _BitString(''.join(map(str, map(int, boolvals)))) 72 # or, equivalently: 73 bitstr = _BitString.from_bool(boolvals) 74 75 To convert back:: 76 77 # get all the terminal clades of the first tree 78 terms = [term for term in trees[0].get_terminals()] 79 # get the index of terminal clades in bitstr 80 index_list = bitstr.index_one() 81 # get all terminal clades by index 82 clade_terms = [terms[i] for i in index_list] 83 # create a new calde and append all the terminal clades 84 new_clade = BaseTree.Clade() 85 new_clade.clades.extend(clade_terms) 86 87 88 Example 89 ------- 90 91 >>> from Bio.Phylo.Consensus import _BitString 92 >>> bitstr1 = _BitString('11111') 93 >>> bitstr2 = _BitString('11100') 94 >>> bitstr3 = _BitString('01101') 95 >>> bitstr1 96 _BitString('11111') 97 >>> bitstr2 & bitstr3 98 _BitString('01100') 99 >>> bitstr2 | bitstr3 100 _BitString('11101') 101 >>> bitstr2 ^ bitstr3 102 _BitString('10001') 103 >>> bitstr2.index_one() 104 [0, 1, 2] 105 >>> bitstr3.index_one() 106 [1, 2, 4] 107 >>> bitstr3.index_zero() 108 [0, 3] 109 >>> bitstr1.contains(bitstr2) 110 True 111 >>> bitstr2.contains(bitstr3) 112 False 113 >>> bitstr2.independent(bitstr3) 114 False 115 >>> bitstr1.iscompatible(bitstr2) 116 True 117 >>> bitstr2.iscompatible(bitstr3) 118 False 119 """ 120
121 - def __new__(cls, strdata):
122 """Init from a binary string data""" 123 if (isinstance(strdata, str) and 124 len(strdata) == strdata.count('0') + strdata.count('1')): 125 return str.__new__(cls, strdata) 126 else: 127 raise TypeError( 128 "The input should be a binary string composed of '0' and '1'")
129
130 - def __and__(self, other):
131 selfint = literal_eval('0b' + self) 132 otherint = literal_eval('0b' + other) 133 resultint = selfint & otherint 134 return _BitString(bin(resultint)[2:].zfill(len(self)))
135
136 - def __or__(self, other):
137 selfint = literal_eval('0b' + self) 138 otherint = literal_eval('0b' + other) 139 resultint = selfint | otherint 140 return _BitString(bin(resultint)[2:].zfill(len(self)))
141
142 - def __xor__(self, other):
143 selfint = literal_eval('0b' + self) 144 otherint = literal_eval('0b' + other) 145 resultint = selfint ^ otherint 146 return _BitString(bin(resultint)[2:].zfill(len(self)))
147
148 - def __rand__(self, other):
149 selfint = literal_eval('0b' + self) 150 otherint = literal_eval('0b' + other) 151 resultint = otherint & selfint 152 return _BitString(bin(resultint)[2:].zfill(len(self)))
153
154 - def __ror__(self, other):
155 selfint = literal_eval('0b' + self) 156 otherint = literal_eval('0b' + other) 157 resultint = otherint | selfint 158 return _BitString(bin(resultint)[2:].zfill(len(self)))
159
160 - def __rxor__(self, other):
161 selfint = literal_eval('0b' + self) 162 otherint = literal_eval('0b' + other) 163 resultint = otherint ^ selfint 164 return _BitString(bin(resultint)[2:].zfill(len(self)))
165
166 - def __repr__(self):
167 return '_BitString(' + str.__repr__(self) + ')'
168
169 - def index_one(self):
170 """Return a list of positions where the element is '1'""" 171 return [i for i, n in enumerate(self) if n == '1']
172
173 - def index_zero(self):
174 """Return a list of positions where the element is '0'""" 175 return [i for i, n in enumerate(self) if n == '0']
176
177 - def contains(self, other):
178 """Check if current bitstr1 contains another one bitstr2. 179 180 That is to say, the bitstr2.index_one() is a subset of 181 bitstr1.index_one(). 182 183 Examples: 184 "011011" contains "011000", "011001", "000011" 185 186 Be careful, "011011" also contains "000000". Actually, all _BitString 187 objects contain all-zero _BitString of the same length. 188 189 """ 190 xorbit = self ^ other 191 return (xorbit.count('1') == self.count('1') - other.count('1'))
192
193 - def independent(self, other):
194 """Check if current bitstr1 is independent of another one bitstr2. 195 196 That is to say the bitstr1.index_one() and bitstr2.index_one() have 197 no intersection. 198 199 Be careful, all _BitString objects are independent of all-zero _BitString 200 of the same length. 201 """ 202 xorbit = self ^ other 203 return (xorbit.count('1') == self.count('1') + other.count('1'))
204
205 - def iscompatible(self, other):
206 """Check if current bitstr1 is compatible with another bitstr2. 207 208 Two conditions are considered as compatible: 209 1. bitstr1.contain(bitstr2) or vise versa; 210 2. bitstr1.independent(bitstr2). 211 212 """ 213 return (self.contains(other) or other.contains(self) or 214 self.independent(other))
215 216 @classmethod
217 - def from_bool(cls, bools):
218 return cls(''.join(map(str, map(int, bools))))
219
220 221 -def strict_consensus(trees):
222 """Search strict consensus tree from multiple trees. 223 224 :Parameters: 225 trees : iterable 226 iterable of trees to produce consensus tree. 227 228 """ 229 trees_iter = iter(trees) 230 first_tree = next(trees_iter) 231 232 terms = first_tree.get_terminals() 233 bitstr_counts, tree_count = _count_clades( 234 itertools.chain([first_tree], trees_iter)) 235 236 # Store bitstrs for strict clades 237 strict_bitstrs = [bitstr for bitstr, t in bitstr_counts.items() 238 if t[0] == tree_count] 239 strict_bitstrs.sort(key=lambda bitstr: bitstr.count('1'), reverse=True) 240 # Create root 241 root = BaseTree.Clade() 242 if strict_bitstrs[0].count('1') == len(terms): 243 root.clades.extend(terms) 244 else: 245 raise ValueError('Taxons in provided trees should be consistent') 246 # make a bitstr to clades dict and store root clade 247 bitstr_clades = {strict_bitstrs[0]: root} 248 # create inner clades 249 for bitstr in strict_bitstrs[1:]: 250 clade_terms = [terms[i] for i in bitstr.index_one()] 251 clade = BaseTree.Clade() 252 clade.clades.extend(clade_terms) 253 for bs, c in bitstr_clades.items(): 254 # check if it should be the parent of current clade 255 if bs.contains(bitstr): 256 # remove old bitstring 257 del bitstr_clades[bs] 258 # update clade childs 259 new_childs = [child for child in c.clades 260 if child not in clade_terms] 261 c.clades = new_childs 262 # set current clade as child of c 263 c.clades.append(clade) 264 # update bitstring 265 bs = bs ^ bitstr 266 # update clade 267 bitstr_clades[bs] = c 268 break 269 # put new clade 270 bitstr_clades[bitstr] = clade 271 return BaseTree.Tree(root=root)
272
273 274 -def majority_consensus(trees, cutoff=0):
275 """Search majority rule consensus tree from multiple trees. 276 277 This is a extend majority rule method, which means the you can set any 278 cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0 to 279 create a relaxed binary consensus tree in any condition (as long as one of 280 the provided trees is a binary tree). The branch length of each consensus 281 clade in the result consensus tree is the average length of all counts for 282 that clade. 283 284 :Parameters: 285 trees : iterable 286 iterable of trees to produce consensus tree. 287 288 """ 289 tree_iter = iter(trees) 290 first_tree = next(tree_iter) 291 292 terms = first_tree.get_terminals() 293 bitstr_counts, tree_count = _count_clades( 294 itertools.chain([first_tree], tree_iter)) 295 296 # Sort bitstrs by descending #occurrences, then #tips, then tip order 297 bitstrs = sorted(bitstr_counts.keys(), 298 key=lambda bitstr: (bitstr_counts[bitstr][0], 299 bitstr.count('1'), 300 str(bitstr)), 301 reverse=True) 302 root = BaseTree.Clade() 303 if bitstrs[0].count('1') == len(terms): 304 root.clades.extend(terms) 305 else: 306 raise ValueError('Taxons in provided trees should be consistent') 307 # Make a bitstr-to-clades dict and store root clade 308 bitstr_clades = {bitstrs[0]: root} 309 # create inner clades 310 for bitstr in bitstrs[1:]: 311 # apply majority rule 312 count_in_trees, branch_length_sum = bitstr_counts[bitstr] 313 confidence = 100.0 * count_in_trees / tree_count 314 if confidence < cutoff * 100.0: 315 break 316 clade_terms = [terms[i] for i in bitstr.index_one()] 317 clade = BaseTree.Clade() 318 clade.clades.extend(clade_terms) 319 clade.confidence = confidence 320 clade.branch_length = branch_length_sum / count_in_trees 321 bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count('1'), 322 reverse=True) 323 324 # check if current clade is compatible with previous clades and 325 # record it's possible parent and child clades. 326 compatible = True 327 parent_bitstr = None 328 child_bitstrs = [] # multiple independent childs 329 for bs in bsckeys: 330 if not bs.iscompatible(bitstr): 331 compatible = False 332 break 333 # assign the closest ancestor as its parent 334 # as bsckeys is sorted, it should be the last one 335 if bs.contains(bitstr): 336 parent_bitstr = bs 337 # assign the closest descendant as its child 338 # the largest and independent clades 339 if (bitstr.contains(bs) and bs != bitstr and 340 all(c.independent(bs) for c in child_bitstrs)): 341 child_bitstrs.append(bs) 342 if not compatible: 343 continue 344 345 if parent_bitstr: 346 # insert current clade; remove old bitstring 347 parent_clade = bitstr_clades.pop(parent_bitstr) 348 # update parent clade childs 349 parent_clade.clades = [c for c in parent_clade.clades 350 if c not in clade_terms] 351 # set current clade as child of parent_clade 352 parent_clade.clades.append(clade) 353 # update bitstring 354 # parent = parent ^ bitstr 355 # update clade 356 bitstr_clades[parent_bitstr] = parent_clade 357 358 if child_bitstrs: 359 remove_list = [] 360 for c in child_bitstrs: 361 remove_list.extend(c.index_one()) 362 child_clade = bitstr_clades[c] 363 parent_clade.clades.remove(child_clade) 364 clade.clades.append(child_clade) 365 remove_terms = [terms[i] for i in remove_list] 366 clade.clades = [c for c in clade.clades if c not in remove_terms] 367 # put new clade 368 bitstr_clades[bitstr] = clade 369 if ((len(bitstr_clades) == len(terms) - 1) or 370 (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)): 371 break 372 return BaseTree.Tree(root=root)
373
374 375 -def adam_consensus(trees):
376 """Search Adam Consensus tree from multiple trees 377 378 :Parameters: 379 trees : list 380 list of trees to produce consensus tree. 381 382 """ 383 clades = [tree.root for tree in trees] 384 return BaseTree.Tree(root=_part(clades), rooted=True)
385
386 387 -def _part(clades):
388 """Recursive function of adam consensus algorithm""" 389 new_clade = None 390 terms = clades[0].get_terminals() 391 term_names = [term.name for term in terms] 392 if len(terms) == 1 or len(terms) == 2: 393 new_clade = clades[0] 394 else: 395 bitstrs = set([_BitString('1' * len(terms))]) 396 for clade in clades: 397 for child in clade.clades: 398 bitstr = _clade_to_bitstr(child, term_names) 399 to_remove = set() 400 to_add = set() 401 for bs in bitstrs: 402 if bs == bitstr: 403 continue 404 elif bs.contains(bitstr): 405 to_add.add(bitstr) 406 to_add.add(bs ^ bitstr) 407 to_remove.add(bs) 408 elif bitstr.contains(bs): 409 to_add.add(bs ^ bitstr) 410 elif not bs.independent(bitstr): 411 to_add.add(bs & bitstr) 412 to_add.add(bs & bitstr ^ bitstr) 413 to_add.add(bs & bitstr ^ bs) 414 to_remove.add(bs) 415 # bitstrs = bitstrs | to_add 416 bitstrs ^= to_remove 417 if to_add: 418 for ta in sorted(to_add, key=lambda bs: bs.count('1')): 419 independent = True 420 for bs in bitstrs: 421 if not ta.independent(bs): 422 independent = False 423 break 424 if independent: 425 bitstrs.add(ta) 426 new_clade = BaseTree.Clade() 427 for bitstr in sorted(bitstrs): 428 indices = bitstr.index_one() 429 if len(indices) == 1: 430 new_clade.clades.append(terms[indices[0]]) 431 elif len(indices) == 2: 432 bifur_clade = BaseTree.Clade() 433 bifur_clade.clades.append(terms[indices[0]]) 434 bifur_clade.clades.append(terms[indices[1]]) 435 new_clade.clades.append(bifur_clade) 436 elif len(indices) > 2: 437 part_names = [term_names[i] for i in indices] 438 next_clades = [] 439 for clade in clades: 440 next_clades.append(_sub_clade(clade, part_names)) 441 # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades] 442 new_clade.clades.append(_part(next_clades)) 443 return new_clade
444
445 446 -def _sub_clade(clade, term_names):
447 """Extract a compatible subclade that only contains the given terminal names""" 448 term_clades = [clade.find_any(name) for name in term_names] 449 sub_clade = clade.common_ancestor(term_clades) 450 if len(term_names) != sub_clade.count_terminals(): 451 temp_clade = BaseTree.Clade() 452 temp_clade.clades.extend(term_clades) 453 for c in sub_clade.find_clades(terminal=False, order="preorder"): 454 if c == sub_clade.root: 455 continue 456 childs = set(c.find_clades(terminal=True)) & set(term_clades) 457 if childs: 458 for tc in temp_clade.find_clades(terminal=False, 459 order="preorder"): 460 tc_childs = set(tc.clades) 461 tc_new_clades = tc_childs - childs 462 if childs.issubset(tc_childs) and tc_new_clades: 463 tc.clades = list(tc_new_clades) 464 child_clade = BaseTree.Clade() 465 child_clade.clades.extend(list(childs)) 466 tc.clades.append(child_clade) 467 sub_clade = temp_clade 468 return sub_clade
469
470 471 -def _count_clades(trees):
472 """Count distinct clades (different sets of terminal names) in the trees. 473 474 Return a tuple first a dict of bitstring (representing clade) and a tuple of its count of 475 occurrences and sum of branch length for that clade, second the number of trees processed. 476 477 :Parameters: 478 trees : iterable 479 An iterable that returns the trees to count 480 481 """ 482 bitstrs = {} 483 tree_count = 0 484 for tree in trees: 485 tree_count += 1 486 clade_bitstrs = _tree_to_bitstrs(tree) 487 for clade in tree.find_clades(terminal=False): 488 bitstr = clade_bitstrs[clade] 489 if bitstr in bitstrs: 490 count, sum_bl = bitstrs[bitstr] 491 count += 1 492 sum_bl += clade.branch_length or 0 493 bitstrs[bitstr] = (count, sum_bl) 494 else: 495 bitstrs[bitstr] = (1, clade.branch_length or 0) 496 return bitstrs, tree_count
497
498 499 -def get_support(target_tree, trees, len_trees=None):
500 """Calculate branch support for a target tree given bootstrap replicate trees. 501 502 :Parameters: 503 target_tree : Tree 504 tree to calculate branch support for. 505 trees : iterable 506 iterable of trees used to calculate branch support. 507 len_trees : int 508 optional count of replicates in trees. len_trees must be provided 509 when len(trees) is not a valid operation. 510 511 """ 512 term_names = sorted(term.name 513 for term in target_tree.find_clades(terminal=True)) 514 bitstrs = {} 515 516 size = len_trees 517 if size is None: 518 try: 519 size = len(trees) 520 except TypeError: 521 raise TypeError("Trees does not support len(trees), " 522 "you must provide the number of replicates in trees " 523 "as the optional parameter len_trees.") 524 525 for clade in target_tree.find_clades(terminal=False): 526 bitstr = _clade_to_bitstr(clade, term_names) 527 bitstrs[bitstr] = (clade, 0) 528 for tree in trees: 529 for clade in tree.find_clades(terminal=False): 530 bitstr = _clade_to_bitstr(clade, term_names) 531 if bitstr in bitstrs: 532 c, t = bitstrs[bitstr] 533 c.confidence = (t + 1) * 100.0 / size 534 bitstrs[bitstr] = (c, t + 1) 535 return target_tree
536
537 538 -def bootstrap(msa, times):
539 """Generate bootstrap replicates from a multiple sequence alignment object 540 541 :Parameters: 542 msa : MultipleSeqAlignment 543 multiple sequence alignment to generate replicates. 544 times : int 545 number of bootstrap times. 546 547 """ 548 length = len(msa[0]) 549 i = 0 550 while i < times: 551 i += 1 552 item = None 553 for j in range(length): 554 col = random.randint(0, length - 1) 555 if not item: 556 item = msa[:, col:col + 1] 557 else: 558 item += msa[:, col:col + 1] 559 yield item
560
561 562 -def bootstrap_trees(msa, times, tree_constructor):
563 """Generate bootstrap replicate trees from a multiple sequence alignment. 564 565 :Parameters: 566 msa : MultipleSeqAlignment 567 multiple sequence alignment to generate replicates. 568 times : int 569 number of bootstrap times. 570 tree_constructor : TreeConstructor 571 tree constructor to be used to build trees. 572 573 """ 574 msas = bootstrap(msa, times) 575 for aln in msas: 576 tree = tree_constructor.build_tree(aln) 577 yield tree
578
579 580 -def bootstrap_consensus(msa, times, tree_constructor, consensus):
581 """Consensus tree of a series of bootstrap trees for a multiple sequence alignment 582 583 :Parameters: 584 msa : MultipleSeqAlignment 585 Multiple sequence alignment to generate replicates. 586 times : int 587 Number of bootstrap times. 588 tree_constructor : TreeConstructor 589 Tree constructor to be used to build trees. 590 consensus : function 591 Consensus method in this module: `strict_consensus`, 592 `majority_consensus`, `adam_consensus`. 593 594 """ 595 trees = bootstrap_trees(msa, times, tree_constructor) 596 tree = consensus(list(trees)) 597 return tree
598
599 600 -def _clade_to_bitstr(clade, tree_term_names):
601 """Create a BitString representing a clade, given ordered tree taxon names.""" 602 clade_term_names = set(term.name for term in 603 clade.find_clades(terminal=True)) 604 return _BitString.from_bool((name in clade_term_names) 605 for name in tree_term_names)
606
607 608 -def _tree_to_bitstrs(tree):
609 """Create a dict of a tree's clades to corresponding BitStrings.""" 610 clades_bitstrs = {} 611 term_names = [term.name for term in tree.find_clades(terminal=True)] 612 for clade in tree.find_clades(terminal=False): 613 bitstr = _clade_to_bitstr(clade, term_names) 614 clades_bitstrs[clade] = bitstr 615 return clades_bitstrs
616
617 618 -def _bitstring_topology(tree):
619 """Generates a branch length dict for a tree, keyed by BitStrings. 620 621 Create a dict of all clades' BitStrings to the corresponding branch 622 lengths (rounded to 5 decimal places). 623 """ 624 bitstrs = {} 625 for clade, bitstr in _tree_to_bitstrs(tree).items(): 626 bitstrs[bitstr] = round(clade.branch_length or 0.0, 5) 627 return bitstrs
628
629 630 -def _equal_topology(tree1, tree2):
631 """Are two trees are equal in terms of topology and branch lengths. 632 633 (Branch lengths checked to 5 decimal places.) 634 """ 635 term_names1 = set(term.name for term in tree1.find_clades(terminal=True)) 636 term_names2 = set(term.name for term in tree2.find_clades(terminal=True)) 637 return ((term_names1 == term_names2) and 638 (_bitstring_topology(tree1) == _bitstring_topology(tree2)))
639