Package Bio :: Package Phylo :: Module Consensus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.Consensus

  1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ Classes and methods for finding consensus trees. 
  7   
  8  This module contains a ``_BitString`` class to assist the consensus tree 
  9  searching and some common consensus algorithms such as strict, majority rule and 
 10  adam consensus. 
 11  """ 
 12  from __future__ import division 
 13   
 14  import random 
 15   
 16  from ast import literal_eval 
 17  from Bio.Phylo import BaseTree 
18 19 20 -class _BitString(str):
21 """Assistant class of binary string data used for storing and 22 counting compatible clades in consensus tree searching. It includes 23 some binary manipulation(&|^~) methods. 24 25 _BitString is a sub-class of ``str`` object that only accepts two 26 characters('0' and '1'), with additional functions for binary-like 27 manipulation(&|^~). It is used to count and store the clades in 28 multiple trees in consensus tree searching. During counting, the 29 clades will be considered the same if their terminals(in terms of 30 ``name`` attribute) are the same. 31 32 For example, let's say two trees are provided as below to search 33 their strict consensus tree: 34 35 tree1: (((A, B), C),(D, E)) 36 tree2: ((A, (B, C)),(D, E)) 37 38 For both trees, a _BitString object '11111' will represent their 39 root clade. Each '1' stands for the terminal clade in the list 40 [A, B, C, D, E](the order might not be the same, it's determined 41 by the ``get_terminal`` method of the first tree provided). For 42 the clade ((A, B), C) in tree1 and (A, (B, C)) in tree2, they both 43 can be represented by '11100'. Similarly, '11000' represents clade 44 (A, B) in tree1, '01100' represents clade (B, C) in tree2, and '00011' 45 represents clade (D, E) in both trees. 46 47 So, with the ``_count_clades`` function in this module, finally we 48 can get the clade counts and their _BitString representation as follows 49 (the root and terminals are omitted): 50 51 clade _BitString count 52 ABC '11100' 2 53 DE '00011' 2 54 AB '11000' 1 55 BC '01100' 1 56 57 To get the _BitString representation of a clade, we can use the following 58 code snippet: 59 60 # suppose we are provided with a tree list, the first thing to do is 61 # to get all the terminal names in the first tree 62 term_names = [term.name for term in trees[0].get_terminals()] 63 # for a specific clade in any of the tree, also get its terminal names 64 clade_term_names = [term.name for term in clade.get_terminals()] 65 # then create a boolean list 66 boolvals = [name in clade_term_names for name in term_names] 67 # create the string version and pass it to _BitString 68 bitstr = _BitString(''.join(map(str, map(int, boolvals)))) 69 # or, equivalently: 70 bitstr = _BitString.from_bool(boolvals) 71 72 To convert back: 73 74 # get all the terminal clades of the first tree 75 terms = [term for term in trees[0].get_terminals()] 76 # get the index of terminal clades in bitstr 77 index_list = bitstr.index_one() 78 # get all terminal clades by index 79 clade_terms = [terms[i] for i in index_list] 80 # create a new calde and append all the terminal clades 81 new_clade = BaseTree.Clade() 82 new_clade.clades.extend(clade_terms) 83 84 85 Example 86 ------- 87 88 >>> from Bio.Phylo.Consensus import _BitString 89 >>> bitstr1 = _BitString('11111') 90 >>> bitstr2 = _BitString('11100') 91 >>> bitstr3 = _BitString('01101') 92 >>> bitstr1 93 _BitString('11111') 94 >>> bitstr2 & bitstr3 95 _BitString('01100') 96 >>> bitstr2 | bitstr3 97 _BitString('11101') 98 >>> bitstr2 ^ bitstr3 99 _BitString('10001') 100 >>> bitstr2.index_one() 101 [0, 1, 2] 102 >>> bitstr3.index_one() 103 [1, 2, 4] 104 >>> bitstr3.index_zero() 105 [0, 3] 106 >>> bitstr1.contains(bitstr2) 107 True 108 >>> bitstr2.contains(bitstr3) 109 False 110 >>> bitstr2.independent(bitstr3) 111 False 112 >>> bitstr2.independent(bitstr4) 113 True 114 >>> bitstr1.iscompatible(bitstr2) 115 True 116 >>> bitstr2.iscompatible(bitstr3) 117 False 118 >>> bitstr2.iscompatible(bitstr4) 119 True 120 """ 121
122 - def __new__(cls, strdata):
123 """init from a binary string data""" 124 if (isinstance(strdata, str) and 125 len(strdata) == strdata.count('0') + strdata.count('1')): 126 return str.__new__(cls, strdata) 127 else: 128 raise TypeError("The input should be a binary string composed of '0' and '1'")
129
130 - def __and__(self, other):
131 selfint = literal_eval('0b' + self) 132 otherint = literal_eval('0b' + other) 133 resultint = selfint & otherint 134 return _BitString(bin(resultint)[2:].zfill(len(self)))
135
136 - def __or__(self, other):
137 selfint = literal_eval('0b' + self) 138 otherint = literal_eval('0b' + other) 139 resultint = selfint | otherint 140 return _BitString(bin(resultint)[2:].zfill(len(self)))
141
142 - def __xor__(self, other):
143 selfint = literal_eval('0b' + self) 144 otherint = literal_eval('0b' + other) 145 resultint = selfint ^ otherint 146 return _BitString(bin(resultint)[2:].zfill(len(self)))
147 148
149 - def __rand__(self, other):
150 selfint = literal_eval('0b' + self) 151 otherint = literal_eval('0b' + other) 152 resultint = otherint & selfint 153 return _BitString(bin(resultint)[2:].zfill(len(self)))
154
155 - def __ror__(self, other):
156 selfint = literal_eval('0b' + self) 157 otherint = literal_eval('0b' + other) 158 resultint = otherint | selfint 159 return _BitString(bin(resultint)[2:].zfill(len(self)))
160
161 - def __rxor__(self, other):
162 selfint = literal_eval('0b' + self) 163 otherint = literal_eval('0b' + other) 164 resultint = otherint ^ selfint 165 return _BitString(bin(resultint)[2:].zfill(len(self)))
166
167 - def __repr__(self):
168 return '_BitString(' + str.__repr__(self) + ')'
169
170 - def index_one(self):
171 """Return a list of positions where the element is '1'""" 172 return [i for i, n in enumerate(self) if n == '1']
173
174 - def index_zero(self):
175 """Return a list of positions where the element is '0'""" 176 return [i for i, n in enumerate(self) if n == '0']
177
178 - def contains(self, other):
179 """Check if current bitstr1 contains another one bitstr2. 180 181 That is to say, the bitstr2.index_one() is a subset of 182 bitstr1.index_one(). 183 184 Examples: 185 "011011" contains "011000", "011001", "000011" 186 187 Be careful, "011011" also contains "000000". Actually, all _BitString 188 objects contain all-zero _BitString of the same length. 189 """ 190 xorbit = self ^ other 191 return (xorbit.count('1') == self.count('1') - other.count('1'))
192
193 - def independent(self, other):
194 """Check if current bitstr1 is independent of another one bitstr2. 195 That is to say the bitstr1.index_one() and bitstr2.index_one() have 196 no intersection. 197 198 Be careful, all _BitString objects are independent of all-zero _BitString 199 of the same length. 200 """ 201 xorbit = self ^ other 202 return (xorbit.count('1') == self.count('1') + other.count('1'))
203
204 - def iscompatible(self, other):
205 """Check if current bitstr1 is compatible with another bitstr2. 206 207 Two conditions are considered as compatible: 208 209 1. bitstr1.contain(bitstr2) or vise versa; 210 2. bitstr1.independent(bitstr2). 211 """ 212 return (self.contains(other) or other.contains(self) or 213 self.independent(other))
214 215 @classmethod
216 - def from_bool(cls, bools):
217 return cls(''.join(map(str, map(int, bools))))
218
219 220 221 -def strict_consensus(trees):
222 """Search strict consensus tree from multiple trees. 223 224 :Parameters: 225 trees: list 226 list of trees to produce consensus tree. 227 """ 228 terms = trees[0].get_terminals() 229 bitstr_counts = _count_clades(trees) 230 # Store bitstrs for strict clades 231 strict_bitstrs = [bitstr for bitstr, t in bitstr_counts.items() 232 if t[0] == len(trees)] 233 strict_bitstrs.sort(key=lambda bitstr: bitstr.count('1'), reverse=True) 234 # Create root 235 root = BaseTree.Clade() 236 if strict_bitstrs[0].count('1') == len(terms): 237 root.clades.extend(terms) 238 else: 239 raise ValueError('Taxons in provided trees should be consistent') 240 # make a bitstr to clades dict and store root clade 241 bitstr_clades = {strict_bitstrs[0]: root} 242 # create inner clades 243 for bitstr in strict_bitstrs[1:]: 244 clade_terms = [terms[i] for i in bitstr.index_one()] 245 clade = BaseTree.Clade() 246 clade.clades.extend(clade_terms) 247 for bs, c in bitstr_clades.items(): 248 # check if it should be the parent of current clade 249 if bs.contains(bitstr): 250 # remove old bitstring 251 del bitstr_clades[bs] 252 # update clade childs 253 new_childs = [child for child in c.clades 254 if child not in clade_terms] 255 c.clades = new_childs 256 # set current clade as child of c 257 c.clades.append(clade) 258 # update bitstring 259 bs = bs ^ bitstr 260 # update clade 261 bitstr_clades[bs] = c 262 break 263 # put new clade 264 bitstr_clades[bitstr] = clade 265 return BaseTree.Tree(root=root)
266
267 268 -def majority_consensus(trees, cutoff=0):
269 """Search majority rule consensus tree from multiple trees. 270 271 This is a extend majority rule method, which means the you can set any 272 cutoff between 0 ~ 1 instead of 0.5. The default value of cutoff is 0 to 273 create a relaxed binary consensus tree in any condition (as long as one of 274 the provided trees is a binary tree). The branch length of each consensus 275 clade in the result consensus tree is the average length of all counts for 276 that clade. 277 278 :Parameters: 279 trees: list 280 list of trees to produce consensus tree. 281 """ 282 terms = trees[0].get_terminals() 283 bitstr_counts = _count_clades(trees) 284 # Sort bitstrs by descending #occurrences, then #tips, then tip order 285 bitstrs = sorted(bitstr_counts.keys(), 286 key=lambda bitstr: (bitstr_counts[bitstr][0], 287 bitstr.count('1'), 288 str(bitstr)), 289 reverse=True) 290 root = BaseTree.Clade() 291 if bitstrs[0].count('1') == len(terms): 292 root.clades.extend(terms) 293 else: 294 raise ValueError('Taxons in provided trees should be consistent') 295 # Make a bitstr-to-clades dict and store root clade 296 bitstr_clades = {bitstrs[0]: root} 297 # create inner clades 298 for bitstr in bitstrs[1:]: 299 # apply majority rule 300 count_in_trees, branch_length_sum = bitstr_counts[bitstr] 301 confidence = 100.0 * count_in_trees / len(trees) 302 if confidence < cutoff * 100.0: 303 break 304 clade_terms = [terms[i] for i in bitstr.index_one()] 305 clade = BaseTree.Clade() 306 clade.clades.extend(clade_terms) 307 clade.confidence = confidence 308 clade.branch_length = branch_length_sum / count_in_trees 309 bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count('1'), 310 reverse=True) 311 312 # check if current clade is compatible with previous clades and 313 # record it's possible parent and child clades. 314 compatible = True 315 parent_bitstr = None 316 child_bitstrs = [] # multiple independent childs 317 for bs in bsckeys: 318 if not bs.iscompatible(bitstr): 319 compatible = False 320 break 321 # assign the closest ancestor as its parent 322 # as bsckeys is sorted, it should be the last one 323 if bs.contains(bitstr): 324 parent_bitstr = bs 325 # assign the closest descendant as its child 326 # the largest and independent clades 327 if (bitstr.contains(bs) and bs != bitstr and 328 all(c.independent(bs) for c in child_bitstrs)): 329 child_bitstrs.append(bs) 330 if not compatible: 331 continue 332 333 if parent_bitstr: 334 # insert current clade; remove old bitstring 335 parent_clade = bitstr_clades.pop(parent_bitstr) 336 # update parent clade childs 337 parent_clade.clades = [c for c in parent_clade.clades 338 if c not in clade_terms] 339 # set current clade as child of parent_clade 340 parent_clade.clades.append(clade) 341 # update bitstring 342 # parent = parent ^ bitstr 343 # update clade 344 bitstr_clades[parent_bitstr] = parent_clade 345 346 if child_bitstrs: 347 remove_list = [] 348 for c in child_bitstrs: 349 remove_list.extend(c.index_one()) 350 child_clade = bitstr_clades[c] 351 parent_clade.clades.remove(child_clade) 352 clade.clades.append(child_clade) 353 remove_terms = [terms[i] for i in remove_list] 354 clade.clades = [c for c in clade.clades if c not in remove_terms] 355 # put new clade 356 bitstr_clades[bitstr] = clade 357 if ((len(bitstr_clades) == len(terms) - 1) or 358 (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)): 359 break 360 return BaseTree.Tree(root=root)
361
362 363 -def adam_consensus(trees):
364 """Search Adam Consensus tree from multiple trees 365 366 :Parameters: 367 trees: list 368 list of trees to produce consensus tree. 369 """ 370 clades = [tree.root for tree in trees] 371 return BaseTree.Tree(root=_part(clades), rooted=True)
372
373 374 -def _part(clades):
375 """recursive function of adam consensus algorithm""" 376 new_clade = None 377 terms = clades[0].get_terminals() 378 term_names = [term.name for term in terms] 379 if len(terms) == 1 or len(terms) == 2: 380 new_clade = clades[0] 381 else: 382 bitstrs = set([_BitString('1' * len(terms))]) 383 for clade in clades: 384 for child in clade.clades: 385 bitstr = _clade_to_bitstr(child, term_names) 386 to_remove = set() 387 to_add = set() 388 for bs in bitstrs: 389 if bs == bitstr: 390 continue 391 elif bs.contains(bitstr): 392 to_add.add(bitstr) 393 to_add.add(bs ^ bitstr) 394 to_remove.add(bs) 395 elif bitstr.contains(bs): 396 to_add.add(bs ^ bitstr) 397 elif not bs.independent(bitstr): 398 to_add.add(bs & bitstr) 399 to_add.add(bs & bitstr ^ bitstr) 400 to_add.add(bs & bitstr ^ bs) 401 to_remove.add(bs) 402 #bitstrs = bitstrs | to_add 403 bitstrs ^= to_remove 404 if to_add: 405 for ta in sorted(to_add, key=lambda bs: bs.count('1')): 406 independent = True 407 for bs in bitstrs: 408 if not ta.independent(bs): 409 independent = False 410 break 411 if independent: 412 bitstrs.add(ta) 413 new_clade = BaseTree.Clade() 414 for bitstr in sorted(bitstrs): 415 indices = bitstr.index_one() 416 if len(indices) == 1: 417 new_clade.clades.append(terms[indices[0]]) 418 elif len(indices) == 2: 419 bifur_clade = BaseTree.Clade() 420 bifur_clade.clades.append(terms[indices[0]]) 421 bifur_clade.clades.append(terms[indices[1]]) 422 new_clade.clades.append(bifur_clade) 423 elif len(indices) > 2: 424 part_names = [term_names[i] for i in indices] 425 next_clades = [] 426 for clade in clades: 427 next_clades.append(_sub_clade(clade, part_names)) 428 # next_clades = [clade.common_ancestor([clade.find_any(name=name) for name in part_names]) for clade in clades] 429 new_clade.clades.append(_part(next_clades)) 430 return new_clade
431
432 433 -def _sub_clade(clade, term_names):
434 """extract a compatible subclade that only contains the given terminal names 435 """ 436 term_clades = [clade.find_any(name) for name in term_names] 437 sub_clade = clade.common_ancestor(term_clades) 438 if len(term_names) != sub_clade.count_terminals(): 439 temp_clade = BaseTree.Clade() 440 temp_clade.clades.extend(term_clades) 441 for c in sub_clade.find_clades(terminal=False, order="preorder"): 442 if c == sub_clade.root: 443 continue 444 childs = set(c.find_clades(terminal=True)) & set(term_clades) 445 if childs: 446 for tc in temp_clade.find_clades(terminal=False, 447 order="preorder"): 448 tc_childs = set(tc.clades) 449 tc_new_clades = tc_childs - childs 450 if childs.issubset(tc_childs) and tc_new_clades: 451 tc.clades = list(tc_new_clades) 452 child_clade = BaseTree.Clade() 453 child_clade.clades.extend(list(childs)) 454 tc.clades.append(child_clade) 455 sub_clade = temp_clade 456 return sub_clade
457
458 459 -def _count_clades(trees):
460 """Count distinct clades (different sets of terminal names) in the trees. 461 462 Return a dict of bitstring (representing clade) and a tuple of its count of 463 occurrences and sum of branch length for that clade. 464 """ 465 bitstrs = {} 466 for tree in trees: 467 clade_bitstrs = _tree_to_bitstrs(tree) 468 for clade in tree.find_clades(terminal=False): 469 bitstr = clade_bitstrs[clade] 470 if bitstr in bitstrs: 471 count, sum_bl = bitstrs[bitstr] 472 count += 1 473 sum_bl += clade.branch_length or 0 474 bitstrs[bitstr] = (count, sum_bl) 475 else: 476 bitstrs[bitstr] = (1, clade.branch_length or 0) 477 return bitstrs
478
479 480 -def get_support(target_tree, trees):
481 """Calculate branch support given a target tree and a list of bootstrap 482 replicate trees 483 484 :Parameters: 485 target_tree: Tree 486 trees: list 487 list of trees calculate branch support. 488 """ 489 term_names = sorted(term.name 490 for term in target_tree.find_clades(terminal=True)) 491 bitstrs = {} 492 size = len(trees) 493 for clade in target_tree.find_clades(terminal=False): 494 bitstr = _clade_to_bitstr(clade, term_names) 495 bitstrs[bitstr] = (clade, 0) 496 for tree in trees: 497 for clade in tree.find_clades(terminal=False): 498 bitstr = _clade_to_bitstr(clade, term_names) 499 if bitstr in bitstrs: 500 c, t = bitstrs[bitstr] 501 c.confidence = (t + 1) * 100.0 / size 502 bitstrs[bitstr] = (c, t + 1) 503 return target_tree
504
505 506 -def bootstrap(msa, times):
507 """yield a series of bootstrap replicates from a multiple sequence 508 alignment object 509 510 :Parameters: 511 msa: MultipleSeqAlignment 512 multiple sequence alignment to generate replicates. 513 times: int 514 number of bootstrap times. 515 """ 516 517 length = len(msa[0]) 518 i = 0 519 while i < times: 520 i += 1 521 item = None 522 for j in range(length): 523 col = random.randint(0, length - 1) 524 if not item: 525 item = msa[:,col:col + 1] 526 else: 527 item += msa[:,col:col + 1] 528 yield item
529
530 531 -def bootstrap_trees(msa, times, tree_constructor):
532 """Yield a series of bootstrap replicate trees from a multiple sequence 533 alignment. 534 535 :Parameters: 536 msa: MultipleSeqAlignment 537 multiple sequence alignment to generate replicates. 538 times: int 539 number of bootstrap times. 540 tree_constructor: TreeConstructor 541 tree constructor to be used to build trees. 542 """ 543 544 msas = bootstrap(msa, times) 545 for aln in msas: 546 tree = tree_constructor.build_tree(aln) 547 yield tree
548
549 550 -def bootstrap_consensus(msa, times, tree_constructor, consensus):
551 """get the consensus tree of a series of bootstrap trees for 552 a multiple sequence alignment 553 554 :Parameters: 555 msa: MultipleSeqAlignment 556 Multiple sequence alignment to generate replicates. 557 times: int 558 Number of bootstrap times. 559 tree_constructor: TreeConstructor 560 Tree constructor to be used to build trees. 561 consensus: function 562 Consensus method in this module: `strict_consensus`, 563 `majority_consensus`, `adam_consensus`. 564 """ 565 trees = bootstrap_trees(msa, times, tree_constructor) 566 tree = consensus(list(trees)) 567 return tree
568
569 570 -def _clade_to_bitstr(clade, tree_term_names):
571 """Create a BitString representing a clade, given ordered tree taxon names. 572 """ 573 clade_term_names = set(term.name for term in 574 clade.find_clades(terminal=True)) 575 return _BitString.from_bool((name in clade_term_names) 576 for name in tree_term_names)
577
578 579 -def _tree_to_bitstrs(tree):
580 """Create a dict of a tree's clades to corresponding BitStrings.""" 581 clades_bitstrs = {} 582 term_names = [term.name for term in tree.find_clades(terminal=True)] 583 for clade in tree.find_clades(terminal=False): 584 bitstr = _clade_to_bitstr(clade, term_names) 585 clades_bitstrs[clade] = bitstr 586 return clades_bitstrs
587
588 589 -def _bitstring_topology(tree):
590 """Create a dict of all clades' BitStrings to the corresponding branch 591 lengths (rounded to 5 decimal places).""" 592 bitstrs = {} 593 for clade, bitstr in _tree_to_bitstrs(tree).items(): 594 bitstrs[bitstr] = round(clade.branch_length or 0.0, 5) 595 return bitstrs
596
597 598 -def _equal_topology(tree1, tree2):
599 """True if two trees are equal in terms of topology and branch lengths 600 (to 5 decimal places).""" 601 term_names1 = set(term.name for term in tree1.find_clades(terminal=True)) 602 term_names2 = set(term.name for term in tree2.find_clades(terminal=True)) 603 return ((term_names1 == term_names2) and 604 (_bitstring_topology(tree1) == _bitstring_topology(tree2)))
605