Package Bio :: Package Phylo :: Module Consensus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.Consensus

  1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ Classes and methods for finding consensus trees. 
  7   
  8  This module contains a ``_BitString`` class to assist the consensus tree 
  9  searching and some common consensus algorithms such as strict, majority rule and 
 10  adam consensus. 
 11  """ 
 12  from __future__ import division 
 13   
 14  import random 
 15  import itertools 
 16   
 17  from ast import literal_eval 
 18  from Bio.Phylo import BaseTree 
19 20 21 -class _BitString(str):
22 """Helper class for binary string data (PRIVATE). 23 24 Assistant class of binary string data used for storing and 25 counting compatible clades in consensus tree searching. It includes 26 some binary manipulation(&|^~) methods. 27 28 _BitString is a sub-class of ``str`` object that only accepts two 29 characters('0' and '1'), with additional functions for binary-like 30 manipulation(&|^~). It is used to count and store the clades in 31 multiple trees in consensus tree searching. During counting, the 32 clades will be considered the same if their terminals(in terms of 33 ``name`` attribute) are the same. 34 35 For example, let's say two trees are provided as below to search 36 their strict consensus tree:: 37 38 tree1: (((A, B), C),(D, E)) 39 tree2: ((A, (B, C)),(D, E)) 40 41 For both trees, a _BitString object '11111' will represent their 42 root clade. Each '1' stands for the terminal clade in the list 43 [A, B, C, D, E](the order might not be the same, it's determined 44 by the ``get_terminal`` method of the first tree provided). For 45 the clade ((A, B), C) in tree1 and (A, (B, C)) in tree2, they both 46 can be represented by '11100'. Similarly, '11000' represents clade 47 (A, B) in tree1, '01100' represents clade (B, C) in tree2, and '00011' 48 represents clade (D, E) in both trees. 49 50 So, with the ``_count_clades`` function in this module, finally we 51 can get the clade counts and their _BitString representation as follows 52 (the root and terminals are omitted):: 53 54 clade _BitString count 55 ABC '11100' 2 56 DE '00011' 2 57 AB '11000' 1 58 BC '01100' 1 59 60 To get the _BitString representation of a clade, we can use the following 61 code snippet:: 62 63 # suppose we are provided with a tree list, the first thing to do is 64 # to get all the terminal names in the first tree 65 term_names = [term.name for term in trees[0].get_terminals()] 66 # for a specific clade in any of the tree, also get its terminal names 67 clade_term_names = [term.name for term in clade.get_terminals()] 68 # then create a boolean list 69 boolvals = [name in clade_term_names for name in term_names] 70 # create the string version and pass it to _BitString 71 bitstr = _BitString(''.join(map(str, map(int, boolvals)))) 72 # or, equivalently: 73 bitstr = _BitString.from_bool(boolvals) 74 75 To convert back:: 76 77 # get all the terminal clades of the first tree 78 terms = [term for term in trees[0].get_terminals()] 79 # get the index of terminal clades in bitstr 80 index_list = bitstr.index_one() 81 # get all terminal clades by index 82 clade_terms = [terms[i] for i in index_list] 83 # create a new calde and append all the terminal clades 84 new_clade = BaseTree.Clade() 85 new_clade.clades.extend(clade_terms) 86 87 88 Example 89 ------- 90 91 >>> from Bio.Phylo.Consensus import _BitString 92 >>> bitstr1 = _BitString('11111') 93 >>> bitstr2 = _BitString('11100') 94 >>> bitstr3 = _BitString('01101') 95 >>> bitstr1 96 _BitString('11111') 97 >>> bitstr2 & bitstr3 98 _BitString('01100') 99 >>> bitstr2 | bitstr3 100 _BitString('11101') 101 >>> bitstr2 ^ bitstr3 102 _BitString('10001') 103 >>> bitstr2.index_one() 104 [0, 1, 2] 105 >>> bitstr3.index_one() 106 [1, 2, 4] 107 >>> bitstr3.index_zero() 108 [0, 3] 109 >>> bitstr1.contains(bitstr2) 110 True 111 >>> bitstr2.contains(bitstr3) 112 False 113 >>> bitstr2.independent(bitstr3) 114 False 115 >>> bitstr1.iscompatible(bitstr2) 116 True 117 >>> bitstr2.iscompatible(bitstr3) 118 False 119 """ 120
121 - def __new__(cls, strdata):
122 """init from a binary string data""" 123 if (isinstance(strdata, str) and 124 len(strdata) == strdata.count('0') + strdata.count('1')): 125 return str.__new__(cls, strdata) 126 else: 127 raise TypeError( 128 "The input should be a binary string composed of '0' and '1'")
129
130 - def __and__(self, other):
131 selfint = literal_eval('0b' + self) 132 otherint = literal_eval('0b' + other) 133 resultint = selfint & otherint 134 return _BitString(bin(resultint)[2:].zfill(len(self)))
135
136 - def __or__(self, other):
137 selfint = literal_eval('0b' + self) 138 otherint = literal_eval('0b' + other) 139 resultint = selfint | otherint 140 return _BitString(bin(resultint)[2:].zfill(len(self)))
141
142 - def __xor__(self, other):
143 selfint = literal_eval('0b' + self) 144 otherint = literal_eval('0b' + other) 145 resultint = selfint ^ otherint 146 return _BitString(bin(resultint)[2:].zfill(len(self)))
147
148 - def __rand__(self, other):
149 selfint = literal_eval('0b' + self) 150 otherint = literal_eval('0b' + other) 151 resultint = otherint & selfint 152 return _BitString(bin(resultint)[2:].zfill(len(self)))
153
154 - def __ror__(self, other):
155 selfint = literal_eval('0b' + self) 156 otherint = literal_eval('0b' + other) 157 resultint = otherint | selfint 158 return _BitString(bin(resultint)[2:].zfill(len(self)))
159
160 - def __rxor__(self, other):
161 selfint = literal_eval('0b' + self) 162 otherint = literal_eval('0b' + other) 163 resultint = otherint ^ selfint 164 return _BitString(bin(resultint)[2:].zfill(len(self)))
165
166 - def __repr__(self):
167 return '_BitString(' + str.__repr__(self) + ')'
168
169 - def index_one(self):
170 """Return a list of positions where the element is '1'""" 171 return [i for i, n in enumerate(self) if n == '1']
172
173 - def index_zero(self):
174 """Return a list of positions where the element is '0'""" 175 return [i for i, n in enumerate(self) if n == '0']
176
177 - def contains(self, other):
178 """Check if current bitstr1 contains another one bitstr2. 179 180 That is to say, the bitstr2.index_one() is a subset of 181 bitstr1.index_one(). 182 183 Examples: 184 "011011" contains "011000", "011001", "000011" 185 186 Be careful, "011011" also contains "000000". Actually, all _BitString 187 objects contain all-zero _BitString of the same length. 188 """ 189 xorbit = self ^ other 190 return (xorbit.count('1') == self.count('1') - other.count('1'))
191
192 - def independent(self, other):
193 """Check if current bitstr1 is independent of another one bitstr2. 194 195 That is to say the bitstr1.index_one() and bitstr2.index_one() have 196 no intersection. 197 198 Be careful, all _BitString objects are independent of all-zero _BitString 199 of the same length. 200 """ 201 xorbit = self ^ other 202 return (xorbit.count('1') == self.count('1') + other.count('1'))
203
204 - def iscompatible(self, other):
205 """Check if current bitstr1 is compatible with another bitstr2. 206 207 Two conditions are considered as compatible: 208 209 1. bitstr1.contain(bitstr2) or vise versa; 210 2. bitstr1.independent(bitstr2). 211 """ 212 return (self.contains(other) or other.contains(self) or 213 self.independent(other))
214 215 @classmethod
216 - def from_bool(cls, bools):
217 return cls(''.join(map(str, map(int, bools))))
218
219 220 -def strict_consensus(trees):
221 """Search strict consensus tree from multiple trees. 222 223 :Parameters: 224 trees : iterable 225 iterable of trees to produce consensus tree. 226 """ 227 trees_iter = iter(trees) 228 first_tree = next(trees_iter) 229 230 terms = first_tree.get_terminals() 231 bitstr_counts, tree_count = _count_clades( 232 itertools.chain([first_tree], trees_iter)) 233 234 # Store bitstrs for strict clades 235 strict_bitstrs = [bitstr for bitstr, t in bitstr_counts.items() 236 if t[0] == tree_count] 237 strict_bitstrs.sort(key=lambda bitstr: bitstr.count('1'), reverse=True) 238 # Create root 239 root = BaseTree.Clade() 240 if strict_bitstrs[0].count('1') == len(terms): 241 root.clades.extend(terms) 242 else: 243 raise ValueError('Taxons in provided trees should be consistent') 244 # make a bitstr to clades dict and store root clade 245 bitstr_clades = {strict_bitstrs[0]: root} 246 # create inner clades 247 for bitstr in strict_bitstrs[1:]: 248 clade_terms = [terms[i] for i in bitstr.index_one()] 249 clade = BaseTree.Clade() 250 clade.clades.extend(clade_terms) 251 for bs, c in bitstr_clades.items(): 252 # check if it should be the parent of current clade 253 if bs.contains(bitstr): 254 # remove old bitstring 255 del bitstr_clades[bs] 256 # update clade childs 257 new_childs = [child for child in c.clades 258 if child not in clade_terms] 259 c.clades = new_childs 260 # set current clade as child of c 261 c.clades.append(clade) 262 # update bitstring 263 bs = bs ^ bitstr 264 # update clade 265 bitstr_clades[bs] = c 266 break 267 # put new clade 268 bitstr_clades[bitstr] = clade 269 return BaseTree.Tree(root=root)
270
271 272 -def majority_consensus(trees, cutoff=0):
273 """Search majority rule consensus tree from multiple trees. 274 275 This is a extend majority rule method, which means the you can set any 276 cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0 to 277 create a relaxed binary consensus tree in any condition (as long as one of 278 the provided trees is a binary tree). The branch length of each consensus 279 clade in the result consensus tree is the average length of all counts for 280 that clade. 281 282 :Parameters: 283 trees : iterable 284 iterable of trees to produce consensus tree. 285 """ 286 tree_iter = iter(trees) 287 first_tree = next(tree_iter) 288 289 terms = first_tree.get_terminals() 290 bitstr_counts, tree_count = _count_clades( 291 itertools.chain([first_tree], tree_iter)) 292 293 # Sort bitstrs by descending #occurrences, then #tips, then tip order 294 bitstrs = sorted(bitstr_counts.keys(), 295 key=lambda bitstr: (bitstr_counts[bitstr][0], 296 bitstr.count('1'), 297 str(bitstr)), 298 reverse=True) 299 root = BaseTree.Clade() 300 if bitstrs[0].count('1') == len(terms): 301 root.clades.extend(terms) 302 else: 303 raise ValueError('Taxons in provided trees should be consistent') 304 # Make a bitstr-to-clades dict and store root clade 305 bitstr_clades = {bitstrs[0]: root} 306 # create inner clades 307 for bitstr in bitstrs[1:]: 308 # apply majority rule 309 count_in_trees, branch_length_sum = bitstr_counts[bitstr] 310 confidence = 100.0 * count_in_trees / tree_count 311 if confidence < cutoff * 100.0: 312 break 313 clade_terms = [terms[i] for i in bitstr.index_one()] 314 clade = BaseTree.Clade() 315 clade.clades.extend(clade_terms) 316 clade.confidence = confidence 317 clade.branch_length = branch_length_sum / count_in_trees 318 bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count('1'), 319 reverse=True) 320 321 # check if current clade is compatible with previous clades and 322 # record it's possible parent and child clades. 323 compatible = True 324 parent_bitstr = None 325 child_bitstrs = [] # multiple independent childs 326 for bs in bsckeys: 327 if not bs.iscompatible(bitstr): 328 compatible = False 329 break 330 # assign the closest ancestor as its parent 331 # as bsckeys is sorted, it should be the last one 332 if bs.contains(bitstr): 333 parent_bitstr = bs 334 # assign the closest descendant as its child 335 # the largest and independent clades 336 if (bitstr.contains(bs) and bs != bitstr and 337 all(c.independent(bs) for c in child_bitstrs)): 338 child_bitstrs.append(bs) 339 if not compatible: 340 continue 341 342 if parent_bitstr: 343 # insert current clade; remove old bitstring 344 parent_clade = bitstr_clades.pop(parent_bitstr) 345 # update parent clade childs 346 parent_clade.clades = [c for c in parent_clade.clades 347 if c not in clade_terms] 348 # set current clade as child of parent_clade 349 parent_clade.clades.append(clade) 350 # update bitstring 351 # parent = parent ^ bitstr 352 # update clade 353 bitstr_clades[parent_bitstr] = parent_clade 354 355 if child_bitstrs: 356 remove_list = [] 357 for c in child_bitstrs: 358 remove_list.extend(c.index_one()) 359 child_clade = bitstr_clades[c] 360 parent_clade.clades.remove(child_clade) 361 clade.clades.append(child_clade) 362 remove_terms = [terms[i] for i in remove_list] 363 clade.clades = [c for c in clade.clades if c not in remove_terms] 364 # put new clade 365 bitstr_clades[bitstr] = clade 366 if ((len(bitstr_clades) == len(terms) - 1) or 367 (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)): 368 break 369 return BaseTree.Tree(root=root)
370
371 372 -def adam_consensus(trees):
373 """Search Adam Consensus tree from multiple trees 374 375 :Parameters: 376 trees : list 377 list of trees to produce consensus tree. 378 """ 379 clades = [tree.root for tree in trees] 380 return BaseTree.Tree(root=_part(clades), rooted=True)
381
382 383 -def _part(clades):
384 """recursive function of adam consensus algorithm""" 385 new_clade = None 386 terms = clades[0].get_terminals() 387 term_names = [term.name for term in terms] 388 if len(terms) == 1 or len(terms) == 2: 389 new_clade = clades[0] 390 else: 391 bitstrs = set([_BitString('1' * len(terms))]) 392 for clade in clades: 393 for child in clade.clades: 394 bitstr = _clade_to_bitstr(child, term_names) 395 to_remove = set() 396 to_add = set() 397 for bs in bitstrs: 398 if bs == bitstr: 399 continue 400 elif bs.contains(bitstr): 401 to_add.add(bitstr) 402 to_add.add(bs ^ bitstr) 403 to_remove.add(bs) 404 elif bitstr.contains(bs): 405 to_add.add(bs ^ bitstr) 406 elif not bs.independent(bitstr): 407 to_add.add(bs & bitstr) 408 to_add.add(bs & bitstr ^ bitstr) 409 to_add.add(bs & bitstr ^ bs) 410 to_remove.add(bs) 411 # bitstrs = bitstrs | to_add 412 bitstrs ^= to_remove 413 if to_add: 414 for ta in sorted(to_add, key=lambda bs: bs.count('1')): 415 independent = True 416 for bs in bitstrs: 417 if not ta.independent(bs): 418 independent = False 419 break 420 if independent: 421 bitstrs.add(ta) 422 new_clade = BaseTree.Clade() 423 for bitstr in sorted(bitstrs): 424 indices = bitstr.index_one() 425 if len(indices) == 1: 426 new_clade.clades.append(terms[indices[0]]) 427 elif len(indices) == 2: 428 bifur_clade = BaseTree.Clade() 429 bifur_clade.clades.append(terms[indices[0]]) 430 bifur_clade.clades.append(terms[indices[1]]) 431 new_clade.clades.append(bifur_clade) 432 elif len(indices) > 2: 433 part_names = [term_names[i] for i in indices] 434 next_clades = [] 435 for clade in clades: 436 next_clades.append(_sub_clade(clade, part_names)) 437 # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades] 438 new_clade.clades.append(_part(next_clades)) 439 return new_clade
440
441 442 -def _sub_clade(clade, term_names):
443 """extract a compatible subclade that only contains the given terminal names""" 444 term_clades = [clade.find_any(name) for name in term_names] 445 sub_clade = clade.common_ancestor(term_clades) 446 if len(term_names) != sub_clade.count_terminals(): 447 temp_clade = BaseTree.Clade() 448 temp_clade.clades.extend(term_clades) 449 for c in sub_clade.find_clades(terminal=False, order="preorder"): 450 if c == sub_clade.root: 451 continue 452 childs = set(c.find_clades(terminal=True)) & set(term_clades) 453 if childs: 454 for tc in temp_clade.find_clades(terminal=False, 455 order="preorder"): 456 tc_childs = set(tc.clades) 457 tc_new_clades = tc_childs - childs 458 if childs.issubset(tc_childs) and tc_new_clades: 459 tc.clades = list(tc_new_clades) 460 child_clade = BaseTree.Clade() 461 child_clade.clades.extend(list(childs)) 462 tc.clades.append(child_clade) 463 sub_clade = temp_clade 464 return sub_clade
465
466 467 -def _count_clades(trees):
468 """Count distinct clades (different sets of terminal names) in the trees. 469 470 Return a tuple first a dict of bitstring (representing clade) and a tuple of its count of 471 occurrences and sum of branch length for that clade, second the number of trees processed. 472 473 :Parameters: 474 trees : iterable 475 An iterable that returns the trees to count 476 """ 477 bitstrs = {} 478 tree_count = 0 479 for tree in trees: 480 tree_count += 1 481 clade_bitstrs = _tree_to_bitstrs(tree) 482 for clade in tree.find_clades(terminal=False): 483 bitstr = clade_bitstrs[clade] 484 if bitstr in bitstrs: 485 count, sum_bl = bitstrs[bitstr] 486 count += 1 487 sum_bl += clade.branch_length or 0 488 bitstrs[bitstr] = (count, sum_bl) 489 else: 490 bitstrs[bitstr] = (1, clade.branch_length or 0) 491 return bitstrs, tree_count
492
493 494 -def get_support(target_tree, trees, len_trees=None):
495 """Calculate branch support for a target tree given bootstrap replicate trees. 496 497 :Parameters: 498 target_tree : Tree 499 tree to calculate branch support for. 500 trees : iterable 501 iterable of trees used to calculate branch support. 502 len_trees : int 503 optional count of replicates in trees. len_trees must be provided 504 when len(trees) is not a valid operation. 505 """ 506 term_names = sorted(term.name 507 for term in target_tree.find_clades(terminal=True)) 508 bitstrs = {} 509 510 size = len_trees 511 if size is None: 512 try: 513 size = len(trees) 514 except TypeError: 515 raise TypeError("Trees does not support len(trees), " 516 "you must provide the number of replicates in trees " 517 "as the optional parameter len_trees.") 518 519 for clade in target_tree.find_clades(terminal=False): 520 bitstr = _clade_to_bitstr(clade, term_names) 521 bitstrs[bitstr] = (clade, 0) 522 for tree in trees: 523 for clade in tree.find_clades(terminal=False): 524 bitstr = _clade_to_bitstr(clade, term_names) 525 if bitstr in bitstrs: 526 c, t = bitstrs[bitstr] 527 c.confidence = (t + 1) * 100.0 / size 528 bitstrs[bitstr] = (c, t + 1) 529 return target_tree
530
531 532 -def bootstrap(msa, times):
533 """Generate bootstrap replicates from a multiple sequence alignment object 534 535 :Parameters: 536 msa : MultipleSeqAlignment 537 multiple sequence alignment to generate replicates. 538 times : int 539 number of bootstrap times. 540 """ 541 542 length = len(msa[0]) 543 i = 0 544 while i < times: 545 i += 1 546 item = None 547 for j in range(length): 548 col = random.randint(0, length - 1) 549 if not item: 550 item = msa[:, col:col + 1] 551 else: 552 item += msa[:, col:col + 1] 553 yield item
554
555 556 -def bootstrap_trees(msa, times, tree_constructor):
557 """Generate bootstrap replicate trees from a multiple sequence alignment. 558 559 :Parameters: 560 msa : MultipleSeqAlignment 561 multiple sequence alignment to generate replicates. 562 times : int 563 number of bootstrap times. 564 tree_constructor : TreeConstructor 565 tree constructor to be used to build trees. 566 """ 567 568 msas = bootstrap(msa, times) 569 for aln in msas: 570 tree = tree_constructor.build_tree(aln) 571 yield tree
572
573 574 -def bootstrap_consensus(msa, times, tree_constructor, consensus):
575 """Consensus tree of a series of bootstrap trees for a multiple sequence alignment 576 577 :Parameters: 578 msa : MultipleSeqAlignment 579 Multiple sequence alignment to generate replicates. 580 times : int 581 Number of bootstrap times. 582 tree_constructor : TreeConstructor 583 Tree constructor to be used to build trees. 584 consensus : function 585 Consensus method in this module: `strict_consensus`, 586 `majority_consensus`, `adam_consensus`. 587 """ 588 trees = bootstrap_trees(msa, times, tree_constructor) 589 tree = consensus(list(trees)) 590 return tree
591
592 593 -def _clade_to_bitstr(clade, tree_term_names):
594 """Create a BitString representing a clade, given ordered tree taxon names.""" 595 clade_term_names = set(term.name for term in 596 clade.find_clades(terminal=True)) 597 return _BitString.from_bool((name in clade_term_names) 598 for name in tree_term_names)
599
600 601 -def _tree_to_bitstrs(tree):
602 """Create a dict of a tree's clades to corresponding BitStrings.""" 603 clades_bitstrs = {} 604 term_names = [term.name for term in tree.find_clades(terminal=True)] 605 for clade in tree.find_clades(terminal=False): 606 bitstr = _clade_to_bitstr(clade, term_names) 607 clades_bitstrs[clade] = bitstr 608 return clades_bitstrs
609
610 611 -def _bitstring_topology(tree):
612 """Generates a branch length dict for a tree, keyed by BitStrings. 613 614 Create a dict of all clades' BitStrings to the corresponding branch 615 lengths (rounded to 5 decimal places).""" 616 bitstrs = {} 617 for clade, bitstr in _tree_to_bitstrs(tree).items(): 618 bitstrs[bitstr] = round(clade.branch_length or 0.0, 5) 619 return bitstrs
620
621 622 -def _equal_topology(tree1, tree2):
623 """Are two trees are equal in terms of topology and branch lengths. 624 625 (Branch lengths checked to 5 decimal places.) 626 """ 627 term_names1 = set(term.name for term in tree1.find_clades(terminal=True)) 628 term_names2 = set(term.name for term in tree2.find_clades(terminal=True)) 629 return ((term_names1 == term_names2) and 630 (_bitstring_topology(tree1) == _bitstring_topology(tree2)))
631