Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7   
   8  import itertools 
   9  import copy 
  10  from Bio.Phylo import BaseTree 
  11  from Bio.Align import MultipleSeqAlignment 
  12  from Bio.SubsMat import MatrixInfo 
  13  from Bio import _py3k 
  14   
  15   
16 -def _is_numeric(x):
17 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
18 19
20 -class _Matrix(object):
21 """Base class for distance matrix or scoring matrix 22 23 Accepts a list of names and a lower triangular matrix.:: 24 25 matrix = [[0], 26 [1, 0], 27 [2, 3, 0], 28 [4, 5, 6, 0]] 29 represents the symmetric matrix of 30 [0,1,2,4] 31 [1,0,3,5] 32 [2,3,0,6] 33 [4,5,6,0] 34 35 :Parameters: 36 names : list 37 names of elements, used for indexing 38 matrix : list 39 nested list of numerical lists in lower triangular format 40 41 Example 42 ------- 43 44 >>> from Bio.Phylo.TreeConstruction import _Matrix 45 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 46 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 47 >>> m = _Matrix(names, matrix) 48 >>> m 49 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 50 51 You can use two indices to get or assign an element in the matrix. 52 53 >>> m[1,2] 54 3 55 >>> m['Beta','Gamma'] 56 3 57 >>> m['Beta','Gamma'] = 4 58 >>> m['Beta','Gamma'] 59 4 60 61 Further more, you can use one index to get or assign a list of elements related to that index. 62 63 >>> m[0] 64 [0, 1, 2, 4] 65 >>> m['Alpha'] 66 [0, 1, 2, 4] 67 >>> m['Alpha'] = [0, 7, 8, 9] 68 >>> m[0] 69 [0, 7, 8, 9] 70 >>> m[0,1] 71 7 72 73 Also you can delete or insert a column&row of elemets by index. 74 75 >>> m 76 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 77 >>> del m['Alpha'] 78 >>> m 79 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 80 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 81 >>> m 82 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 83 84 """ 85
86 - def __init__(self, names, matrix=None):
87 """Initialize matrix by a list of names and a list of 88 lower triangular matrix data""" 89 # check names 90 if isinstance(names, list) and all(isinstance(s, str) for s in names): 91 if len(set(names)) == len(names): 92 self.names = names 93 else: 94 raise ValueError("Duplicate names found") 95 else: 96 raise TypeError("'names' should be a list of strings") 97 98 # check matrix 99 if matrix is None: 100 # create a new one with 0 if matrix is not assigned 101 matrix = [[0] * i for i in range(1, len(self) + 1)] 102 self.matrix = matrix 103 else: 104 # check if all elements are numbers 105 if (isinstance(matrix, list) and 106 all(isinstance(l, list) for l in matrix) and 107 all(_is_numeric(n) for n in [item for sublist in matrix 108 for item in sublist])): 109 # check if the same length with names 110 if len(matrix) == len(names): 111 # check if is lower triangle format 112 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 113 self.matrix = matrix 114 else: 115 raise ValueError( 116 "'matrix' should be in lower triangle format") 117 else: 118 raise ValueError( 119 "'names' and 'matrix' should be the same size") 120 else: 121 raise TypeError("'matrix' should be a list of numerical lists")
122
123 - def __getitem__(self, item):
124 """Access value(s) by the index(s) or name(s). 125 126 For a _Matrix object 'dm':: 127 128 dm[i] get a value list from the given 'i' to others; 129 dm[i, j] get the value between 'i' and 'j'; 130 dm['name'] map name to index first 131 dm['name1', 'name2'] map name to index first 132 """ 133 # Handle single indexing 134 if isinstance(item, (int, str)): 135 index = None 136 if isinstance(item, int): 137 index = item 138 elif isinstance(item, str): 139 if item in self.names: 140 index = self.names.index(item) 141 else: 142 raise ValueError("Item not found.") 143 else: 144 raise TypeError("Invalid index type.") 145 # check index 146 if index > len(self) - 1: 147 raise IndexError("Index out of range.") 148 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 149 # Handle double indexing 150 elif len(item) == 2: 151 row_index = None 152 col_index = None 153 if all(isinstance(i, int) for i in item): 154 row_index, col_index = item 155 elif all(isinstance(i, str) for i in item): 156 row_name, col_name = item 157 if row_name in self.names and col_name in self.names: 158 row_index = self.names.index(row_name) 159 col_index = self.names.index(col_name) 160 else: 161 raise ValueError("Item not found.") 162 else: 163 raise TypeError("Invalid index type.") 164 # check index 165 if row_index > len(self) - 1 or col_index > len(self) - 1: 166 raise IndexError("Index out of range.") 167 if row_index > col_index: 168 return self.matrix[row_index][col_index] 169 else: 170 return self.matrix[col_index][row_index] 171 else: 172 raise TypeError("Invalid index type.")
173
174 - def __setitem__(self, item, value):
175 """Set value by the index(s) or name(s). 176 177 Similar to __getitem__:: 178 179 dm[1] = [1, 0, 3, 4] set values from '1' to others; 180 dm[i, j] = 2 set the value from 'i' to 'j' 181 182 """ 183 184 # Handle single indexing 185 if isinstance(item, (int, str)): 186 index = None 187 if isinstance(item, int): 188 index = item 189 elif isinstance(item, str): 190 if item in self.names: 191 index = self.names.index(item) 192 else: 193 raise ValueError("Item not found.") 194 else: 195 raise TypeError("Invalid index type.") 196 # check index 197 if index > len(self) - 1: 198 raise IndexError("Index out of range.") 199 # check and assign value 200 if isinstance(value, list) and all(_is_numeric(n) for n in value): 201 if len(value) == len(self): 202 for i in range(0, index): 203 self.matrix[index][i] = value[i] 204 for i in range(index, len(self)): 205 self.matrix[i][index] = value[i] 206 else: 207 raise ValueError("Value not the same size.") 208 else: 209 raise TypeError("Invalid value type.") 210 # Handle double indexing 211 elif len(item) == 2: 212 row_index = None 213 col_index = None 214 if all(isinstance(i, int) for i in item): 215 row_index, col_index = item 216 elif all(isinstance(i, str) for i in item): 217 row_name, col_name = item 218 if row_name in self.names and col_name in self.names: 219 row_index = self.names.index(row_name) 220 col_index = self.names.index(col_name) 221 else: 222 raise ValueError("Item not found.") 223 else: 224 raise TypeError("Invalid index type.") 225 # check index 226 if row_index > len(self) - 1 or col_index > len(self) - 1: 227 raise IndexError("Index out of range.") 228 # check and assign value 229 if _is_numeric(value): 230 if row_index > col_index: 231 self.matrix[row_index][col_index] = value 232 else: 233 self.matrix[col_index][row_index] = value 234 else: 235 raise TypeError("Invalid value type.") 236 else: 237 raise TypeError("Invalid index type.")
238
239 - def __delitem__(self, item):
240 """Delete related distances by the index or name""" 241 index = None 242 if isinstance(item, int): 243 index = item 244 elif isinstance(item, str): 245 index = self.names.index(item) 246 else: 247 raise TypeError("Invalid index type.") 248 # remove distances related to index 249 for i in range(index + 1, len(self)): 250 del self.matrix[i][index] 251 del self.matrix[index] 252 # remove name 253 del self.names[index]
254
255 - def insert(self, name, value, index=None):
256 """Insert distances given the name and value. 257 258 :Parameters: 259 name : str 260 name of a row/col to be inserted 261 value : list 262 a row/col of values to be inserted 263 """ 264 if isinstance(name, str): 265 # insert at the given index or at the end 266 if index is None: 267 index = len(self) 268 if not isinstance(index, int): 269 raise TypeError("Invalid index type.") 270 # insert name 271 self.names.insert(index, name) 272 # insert elements of 0, to be assigned 273 self.matrix.insert(index, [0] * index) 274 for i in range(index, len(self)): 275 self.matrix[i].insert(index, 0) 276 # assign value 277 self[index] = value 278 else: 279 raise TypeError("Invalid name type.")
280
281 - def __len__(self):
282 """Matrix length""" 283 return len(self.names)
284
285 - def __repr__(self):
286 return self.__class__.__name__ \ 287 + "(names=%s, matrix=%s)" \ 288 % tuple(map(repr, (self.names, self.matrix)))
289
290 - def __str__(self):
291 """Get a lower triangular matrix string""" 292 matrix_string = '\n'.join( 293 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 294 for i in range(0, len(self))]) 295 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 296 return matrix_string
297 298
299 -class _DistanceMatrix(_Matrix):
300 """Distance matrix class that can be used for distance based tree algorithms. 301 302 All diagonal elements will be zero no matter what the users provide. 303 """ 304
305 - def __init__(self, names, matrix=None):
306 _Matrix.__init__(self, names, matrix) 307 self._set_zero_diagonal()
308
309 - def __setitem__(self, item, value):
310 _Matrix.__setitem__(self, item, value) 311 self._set_zero_diagonal()
312
313 - def _set_zero_diagonal(self):
314 """set all diagonal elements to zero""" 315 for i in range(0, len(self)): 316 self.matrix[i][i] = 0
317 318
319 -class DistanceCalculator(object):
320 """Class to calculate the distance matrix from a DNA or Protein 321 322 Multiple Sequence Alignment(MSA) and the given name of the 323 substitution model. 324 325 Currently only scoring matrices are used. 326 327 :Parameters: 328 model : str 329 Name of the model matrix to be used to calculate distance. 330 The attribute `dna_matrices` contains the available model 331 names for DNA sequences and `protein_matrices` for protein 332 sequences. 333 334 Example 335 ------- 336 337 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 338 >>> from Bio import AlignIO 339 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 340 >>> print aln 341 SingleLetterAlphabet() alignment with 5 rows and 13 columns 342 AACGTGGCCACAT Alpha 343 AAGGTCGCCACAC Beta 344 GAGATTTCCGCCT Delta 345 GAGATCTCCGCCC Epsilon 346 CAGTTCGCCACAA Gamma 347 348 DNA calculator with 'identity' model:: 349 350 >>> calculator = DistanceCalculator('identity') 351 >>> dm = calculator.get_distance(aln) 352 >>> print dm 353 Alpha 0 354 Beta 0.230769230769 0 355 Gamma 0.384615384615 0.230769230769 0 356 Delta 0.538461538462 0.538461538462 0.538461538462 0 357 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 358 Alpha Beta Gamma Delta Epsilon 359 360 Protein calculator with 'blosum62' model:: 361 362 >>> calculator = DistanceCalculator('blosum62') 363 >>> dm = calculator.get_distance(aln) 364 >>> print dm 365 Alpha 0 366 Beta 0.369047619048 0 367 Gamma 0.493975903614 0.25 0 368 Delta 0.585365853659 0.547619047619 0.566265060241 0 369 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 370 Alpha Beta Gamma Delta Epsilon 371 """ 372 373 dna_alphabet = ['A', 'T', 'C', 'G'] 374 375 # BLAST nucleic acid scoring matrix 376 blastn = [[5], 377 [-4, 5], 378 [-4, -4, 5], 379 [-4, -4, -4, 5]] 380 381 # transition/transversion scoring matrix 382 trans = [[6], 383 [-5, 6], 384 [-5, -1, 6], 385 [-1, -5, -5, 6]] 386 387 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 388 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 389 'Z'] 390 391 # matrices available 392 dna_matrices = {'blastn': blastn, 'trans': trans} 393 protein_models = MatrixInfo.available_matrices 394 protein_matrices = dict((name, getattr(MatrixInfo, name)) 395 for name in protein_models) 396 397 dna_models = list(dna_matrices.keys()) 398 399 models = ['identity'] + dna_models + protein_models 400
401 - def __init__(self, model='identity'):
402 """Initialize with a distance model""" 403 404 if model == 'identity': 405 self.scoring_matrix = None 406 elif model in self.dna_models: 407 self.scoring_matrix = _Matrix(self.dna_alphabet, 408 self.dna_matrices[model]) 409 elif model in self.protein_models: 410 self.scoring_matrix = self._build_protein_matrix( 411 self.protein_matrices[model]) 412 else: 413 raise ValueError("Model not supported. Available models: " + 414 ", ".join(self.models))
415
416 - def _pairwise(self, seq1, seq2):
417 """Calculate pairwise distance from two sequences. 418 419 Returns a value between 0 (identical sequences) and 1 (completely 420 different, or seq1 is an empty string.) 421 """ 422 score = 0 423 max_score = 0 424 if self.scoring_matrix: 425 max_score1 = 0 426 max_score2 = 0 427 skip_letters = ['-', '*'] 428 for i in range(0, len(seq1)): 429 l1 = seq1[i] 430 l2 = seq2[i] 431 if l1 in skip_letters or l2 in skip_letters: 432 continue 433 if l1 not in self.scoring_matrix.names: 434 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 435 % (l1, seq1.id, i)) 436 if l2 not in self.scoring_matrix.names: 437 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 438 % (l2, seq2.id, i)) 439 max_score1 += self.scoring_matrix[l1, l1] 440 max_score2 += self.scoring_matrix[l2, l2] 441 score += self.scoring_matrix[l1, l2] 442 # Take the higher score if the matrix is asymmetrical 443 max_score = max(max_score1, max_score2) 444 else: 445 # Score by character identity, not skipping any special letters 446 for i in range(0, len(seq1)): 447 l1 = seq1[i] 448 l2 = seq2[i] 449 if l1 == l2: 450 score += 1 451 max_score = len(seq1) 452 453 if max_score == 0: 454 return 1 # max possible scaled distance 455 return 1 - (score * 1.0 / max_score)
456
457 - def get_distance(self, msa):
458 """Return a _DistanceMatrix for MSA object 459 460 :Parameters: 461 msa : MultipleSeqAlignment 462 DNA or Protein multiple sequence alignment. 463 464 """ 465 466 if not isinstance(msa, MultipleSeqAlignment): 467 raise TypeError("Must provide a MultipleSeqAlignment object.") 468 469 names = [s.id for s in msa] 470 dm = _DistanceMatrix(names) 471 for seq1, seq2 in itertools.combinations(msa, 2): 472 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 473 return dm
474
475 - def _build_protein_matrix(self, subsmat):
476 """Convert matrix from SubsMat format to _Matrix object""" 477 protein_matrix = _Matrix(self.protein_alphabet) 478 for k, v in subsmat.items(): 479 aa1, aa2 = k 480 protein_matrix[aa1, aa2] = v 481 return protein_matrix
482 483
484 -class TreeConstructor(object):
485 """Base class for all tree constructor.""" 486
487 - def build_tree(self, msa):
488 """Caller to built the tree from a MultipleSeqAlignment object. 489 490 This should be implemented in subclass""" 491 raise NotImplementedError("Method not implemented!")
492 493
494 -class DistanceTreeConstructor(TreeConstructor):
495 """Distance based tree constructor. 496 497 :Parameters: 498 method : str 499 Distance tree construction method, 'nj'(default) or 'upgma'. 500 distance_calculator : DistanceCalculator 501 The distance matrix calculator for multiple sequence alignment. 502 It must be provided if `build_tree` will be called. 503 504 Example 505 -------- 506 507 >>> from TreeConstruction import DistanceTreeConstructor 508 >>> constructor = DistanceTreeConstructor() 509 510 UPGMA Tree: 511 512 >>> upgmatree = constructor.upgma(dm) 513 >>> print upgmatree 514 Tree(rooted=True) 515 Clade(name='Inner4') 516 Clade(branch_length=0.171955155115, name='Inner1') 517 Clade(branch_length=0.111111111111, name='Epsilon') 518 Clade(branch_length=0.111111111111, name='Delta') 519 Clade(branch_length=0.0673103855608, name='Inner3') 520 Clade(branch_length=0.0907558806655, name='Inner2') 521 Clade(branch_length=0.125, name='Gamma') 522 Clade(branch_length=0.125, name='Beta') 523 Clade(branch_length=0.215755880666, name='Alpha') 524 525 NJ Tree: 526 527 >>> njtree = constructor.nj(dm) 528 >>> print njtree 529 Tree(rooted=False) 530 Clade(name='Inner3') 531 Clade(branch_length=0.0142054862889, name='Inner2') 532 Clade(branch_length=0.239265540676, name='Inner1') 533 Clade(branch_length=0.0853101915988, name='Epsilon') 534 Clade(branch_length=0.136912030623, name='Delta') 535 Clade(branch_length=0.292306275042, name='Alpha') 536 Clade(branch_length=0.0747705106139, name='Beta') 537 Clade(branch_length=0.175229489386, name='Gamma') 538 539 540 """ 541 542 methods = ['nj', 'upgma'] 543
544 - def __init__(self, distance_calculator=None, method="nj"):
545 if (distance_calculator is None or 546 isinstance(distance_calculator, DistanceCalculator)): 547 self.distance_calculator = distance_calculator 548 else: 549 raise TypeError("Must provide a DistanceCalculator object.") 550 if isinstance(method, str) and method in self.methods: 551 self.method = method 552 else: 553 raise TypeError("Bad method: " + method + 554 ". Available methods: " + ", ".join(self.methods))
555
556 - def build_tree(self, msa):
557 if self.distance_calculator: 558 dm = self.distance_calculator.get_distance(msa) 559 tree = None 560 if self.method == 'upgma': 561 tree = self.upgma(dm) 562 else: 563 tree = self.nj(dm) 564 return tree 565 else: 566 raise TypeError("Must provide a DistanceCalculator object.")
567
568 - def upgma(self, distance_matrix):
569 """Construct and return an UPGMA tree. 570 571 Constructs and returns an Unweighted Pair Group Method 572 with Arithmetic mean (UPGMA) tree. 573 574 :Parameters: 575 distance_matrix : _DistanceMatrix 576 The distance matrix for tree construction. 577 """ 578 if not isinstance(distance_matrix, _DistanceMatrix): 579 raise TypeError("Must provide a _DistanceMatrix object.") 580 581 # make a copy of the distance matrix to be used 582 dm = copy.deepcopy(distance_matrix) 583 # init terminal clades 584 clades = [BaseTree.Clade(None, name) for name in dm.names] 585 # init minimum index 586 min_i = 0 587 min_j = 0 588 inner_count = 0 589 while len(dm) > 1: 590 min_dist = dm[1, 0] 591 # find minimum index 592 for i in range(1, len(dm)): 593 for j in range(0, i): 594 if min_dist >= dm[i, j]: 595 min_dist = dm[i, j] 596 min_i = i 597 min_j = j 598 599 # create clade 600 clade1 = clades[min_i] 601 clade2 = clades[min_j] 602 inner_count += 1 603 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 604 inner_clade.clades.append(clade1) 605 inner_clade.clades.append(clade2) 606 # assign branch length 607 if clade1.is_terminal(): 608 clade1.branch_length = min_dist * 1.0 / 2 609 else: 610 clade1.branch_length = min_dist * \ 611 1.0 / 2 - self._height_of(clade1) 612 613 if clade2.is_terminal(): 614 clade2.branch_length = min_dist * 1.0 / 2 615 else: 616 clade2.branch_length = min_dist * \ 617 1.0 / 2 - self._height_of(clade2) 618 619 # update node list 620 clades[min_j] = inner_clade 621 del clades[min_i] 622 623 # rebuild distance matrix, 624 # set the distances of new node at the index of min_j 625 for k in range(0, len(dm)): 626 if k != min_i and k != min_j: 627 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 628 629 dm.names[min_j] = "Inner" + str(inner_count) 630 631 del dm[min_i] 632 inner_clade.branch_length = 0 633 return BaseTree.Tree(inner_clade)
634
635 - def nj(self, distance_matrix):
636 """Construct and return an Neighbor Joining tree. 637 638 :Parameters: 639 distance_matrix : _DistanceMatrix 640 The distance matrix for tree construction. 641 """ 642 643 if not isinstance(distance_matrix, _DistanceMatrix): 644 raise TypeError("Must provide a _DistanceMatrix object.") 645 646 # make a copy of the distance matrix to be used 647 dm = copy.deepcopy(distance_matrix) 648 # init terminal clades 649 clades = [BaseTree.Clade(None, name) for name in dm.names] 650 # init node distance 651 node_dist = [0] * len(dm) 652 # init minimum index 653 min_i = 0 654 min_j = 0 655 inner_count = 0 656 while len(dm) > 2: 657 # calculate nodeDist 658 for i in range(0, len(dm)): 659 node_dist[i] = 0 660 for j in range(0, len(dm)): 661 node_dist[i] += dm[i, j] 662 node_dist[i] = node_dist[i] / (len(dm) - 2) 663 664 # find minimum distance pair 665 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 666 min_i = 0 667 min_j = 1 668 for i in range(1, len(dm)): 669 for j in range(0, i): 670 temp = dm[i, j] - node_dist[i] - node_dist[j] 671 if min_dist > temp: 672 min_dist = temp 673 min_i = i 674 min_j = j 675 # create clade 676 clade1 = clades[min_i] 677 clade2 = clades[min_j] 678 inner_count += 1 679 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 680 inner_clade.clades.append(clade1) 681 inner_clade.clades.append(clade2) 682 # assign branch length 683 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - 684 node_dist[min_j]) / 2.0 685 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 686 687 # update node list 688 clades[min_j] = inner_clade 689 del clades[min_i] 690 691 # rebuild distance matrix, 692 # set the distances of new node at the index of min_j 693 for k in range(0, len(dm)): 694 if k != min_i and k != min_j: 695 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - 696 dm[min_i, min_j]) / 2.0 697 698 dm.names[min_j] = "Inner" + str(inner_count) 699 del dm[min_i] 700 701 # set the last clade as one of the child of the inner_clade 702 root = None 703 if clades[0] == inner_clade: 704 clades[0].branch_length = 0 705 clades[1].branch_length = dm[1, 0] 706 clades[0].clades.append(clades[1]) 707 root = clades[0] 708 else: 709 clades[0].branch_length = dm[1, 0] 710 clades[1].branch_length = 0 711 clades[1].clades.append(clades[0]) 712 root = clades[1] 713 714 return BaseTree.Tree(root, rooted=False)
715
716 - def _height_of(self, clade):
717 """calculate clade height -- the longest path to any terminal.""" 718 height = 0 719 if clade.is_terminal(): 720 height = clade.branch_length 721 else: 722 height = height + max([self._height_of(c) for c in clade.clades]) 723 return height
724 725 # #################### Tree Scoring and Searching Classes ##################### 726 727
728 -class Scorer(object):
729 """Base class for all tree scoring methods""" 730
731 - def get_score(self, tree, alignment):
732 """Caller to get the score of a tree for the given alignment. 733 734 This should be implemented in subclass""" 735 raise NotImplementedError("Method not implemented!")
736 737
738 -class TreeSearcher(object):
739 """Base class for all tree searching methods""" 740
741 - def search(self, starting_tree, alignment):
742 """Caller to search the best tree with a starting tree. 743 744 This should be implemented in subclass""" 745 raise NotImplementedError("Method not implemented!")
746 747
748 -class NNITreeSearcher(TreeSearcher):
749 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 750 751 :Parameters: 752 scorer : ParsimonyScorer 753 parsimony scorer to calculate the parsimony score of 754 different trees during NNI algorithm. 755 """ 756
757 - def __init__(self, scorer):
758 if isinstance(scorer, Scorer): 759 self.scorer = scorer 760 else: 761 raise TypeError("Must provide a Scorer object.")
762
763 - def search(self, starting_tree, alignment):
764 """Implement the TreeSearcher.search method. 765 766 :Parameters: 767 starting_tree : Tree 768 starting tree of NNI method. 769 alignment : MultipleSeqAlignment 770 multiple sequence alignment used to calculate parsimony 771 score of different NNI trees. 772 """ 773 774 return self._nni(starting_tree, alignment)
775
776 - def _nni(self, starting_tree, alignment):
777 """Search for the best parsimony tree using the NNI algorithm.""" 778 best_tree = starting_tree 779 while True: 780 best_score = self.scorer.get_score(best_tree, alignment) 781 temp = best_score 782 for t in self._get_neighbors(best_tree): 783 score = self.scorer.get_score(t, alignment) 784 if score < best_score: 785 best_score = score 786 best_tree = t 787 # stop if no smaller score exist 788 if best_score >= temp: 789 break 790 return best_tree
791
792 - def _get_neighbors(self, tree):
793 """Get all neighbor trees of the given tree. 794 795 Currently only for binary rooted trees. 796 """ 797 # make child to parent dict 798 parents = {} 799 for clade in tree.find_clades(): 800 if clade != tree.root: 801 node_path = tree.get_path(clade) 802 # cannot get the parent if the parent is root. Bug? 803 if len(node_path) == 1: 804 parents[clade] = tree.root 805 else: 806 parents[clade] = node_path[-2] 807 neighbors = [] 808 root_childs = [] 809 for clade in tree.get_nonterminals(order="level"): 810 if clade == tree.root: 811 left = clade.clades[0] 812 right = clade.clades[1] 813 root_childs.append(left) 814 root_childs.append(right) 815 if not left.is_terminal() and not right.is_terminal(): 816 # make changes around the left_left clade 817 # left_left = left.clades[0] 818 left_right = left.clades[1] 819 right_left = right.clades[0] 820 right_right = right.clades[1] 821 # neightbor 1 (left_left + right_right) 822 del left.clades[1] 823 del right.clades[1] 824 left.clades.append(right_right) 825 right.clades.append(left_right) 826 temp_tree = copy.deepcopy(tree) 827 neighbors.append(temp_tree) 828 # neighbor 2 (left_left + right_left) 829 del left.clades[1] 830 del right.clades[0] 831 left.clades.append(right_left) 832 right.clades.append(right_right) 833 temp_tree = copy.deepcopy(tree) 834 neighbors.append(temp_tree) 835 # change back (left_left + left_right) 836 del left.clades[1] 837 del right.clades[0] 838 left.clades.append(left_right) 839 right.clades.insert(0, right_left) 840 elif clade in root_childs: 841 # skip root child 842 continue 843 else: 844 # method for other clades 845 # make changes around the parent clade 846 left = clade.clades[0] 847 right = clade.clades[1] 848 parent = parents[clade] 849 if clade == parent.clades[0]: 850 sister = parent.clades[1] 851 # neighbor 1 (parent + right) 852 del parent.clades[1] 853 del clade.clades[1] 854 parent.clades.append(right) 855 clade.clades.append(sister) 856 temp_tree = copy.deepcopy(tree) 857 neighbors.append(temp_tree) 858 # neighbor 2 (parent + left) 859 del parent.clades[1] 860 del clade.clades[0] 861 parent.clades.append(left) 862 clade.clades.append(right) 863 temp_tree = copy.deepcopy(tree) 864 neighbors.append(temp_tree) 865 # change back (parent + sister) 866 del parent.clades[1] 867 del clade.clades[0] 868 parent.clades.append(sister) 869 clade.clades.insert(0, left) 870 else: 871 sister = parent.clades[0] 872 # neighbor 1 (parent + right) 873 del parent.clades[0] 874 del clade.clades[1] 875 parent.clades.insert(0, right) 876 clade.clades.append(sister) 877 temp_tree = copy.deepcopy(tree) 878 neighbors.append(temp_tree) 879 # neighbor 2 (parent + left) 880 del parent.clades[0] 881 del clade.clades[0] 882 parent.clades.insert(0, left) 883 clade.clades.append(right) 884 temp_tree = copy.deepcopy(tree) 885 neighbors.append(temp_tree) 886 # change back (parent + sister) 887 del parent.clades[0] 888 del clade.clades[0] 889 parent.clades.insert(0, sister) 890 clade.clades.insert(0, left) 891 return neighbors
892 893 # ######################## Parsimony Classes ########################## 894 895
896 -class ParsimonyScorer(Scorer):
897 """Parsimony scorer with a scoring matrix. 898 899 This is a combination of Fitch algorithm and Sankoff algorithm. 900 See ParsimonyTreeConstructor for usage. 901 902 :Parameters: 903 matrix : _Matrix 904 scoring matrix used in parsimony score calculation. 905 """ 906
907 - def __init__(self, matrix=None):
908 if not matrix or isinstance(matrix, _Matrix): 909 self.matrix = matrix 910 else: 911 raise TypeError("Must provide a _Matrix object.")
912
913 - def get_score(self, tree, alignment):
914 """Calculate and return the parsimony score given a tree and 915 the MSA using the Fitch algorithm without the penalty matrix 916 the Sankoff algorithm with the matrix""" 917 # make sure the tree is rooted and bifurcating 918 if not tree.is_bifurcating(): 919 raise ValueError("The tree provided should be bifurcating.") 920 if not tree.rooted: 921 tree.root_at_midpoint() 922 # sort tree terminals and alignment 923 terms = tree.get_terminals() 924 terms.sort(key=lambda term: term.name) 925 alignment.sort() 926 if not all(t.name == a.id for t, a in zip(terms, alignment)): 927 raise ValueError( 928 "Taxon names of the input tree should be the same with the alignment.") 929 # term_align = dict(zip(terms, alignment)) 930 score = 0 931 for i in range(len(alignment[0])): 932 # parsimony score for column_i 933 score_i = 0 934 # get column 935 column_i = alignment[:, i] 936 # skip non-informative column 937 if column_i == len(column_i) * column_i[0]: 938 continue 939 940 # start calculating score_i using the tree and column_i 941 942 # Fitch algorithm without the penalty matrix 943 if not self.matrix: 944 # init by mapping terminal clades and states in column_i 945 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 946 for clade in tree.get_nonterminals(order="postorder"): 947 clade_childs = clade.clades 948 left_state = clade_states[clade_childs[0]] 949 right_state = clade_states[clade_childs[1]] 950 state = left_state & right_state 951 if not state: 952 state = left_state | right_state 953 score_i = score_i + 1 954 clade_states[clade] = state 955 # Sankoff algorithm with the penalty matrix 956 else: 957 inf = float('inf') 958 # init score arrays for terminal clades 959 alphabet = self.matrix.names 960 length = len(alphabet) 961 clade_scores = {} 962 for j in range(len(column_i)): 963 array = [inf] * length 964 index = alphabet.index(column_i[j]) 965 array[index] = 0 966 clade_scores[terms[j]] = array 967 # bottom up calculation 968 for clade in tree.get_nonterminals(order="postorder"): 969 clade_childs = clade.clades 970 left_score = clade_scores[clade_childs[0]] 971 right_score = clade_scores[clade_childs[1]] 972 array = [] 973 for m in range(length): 974 min_l = inf 975 min_r = inf 976 for n in range(length): 977 sl = self.matrix[ 978 alphabet[m], alphabet[n]] + left_score[n] 979 sr = self.matrix[ 980 alphabet[m], alphabet[n]] + right_score[n] 981 if min_l > sl: 982 min_l = sl 983 if min_r > sr: 984 min_r = sr 985 array.append(min_l + min_r) 986 clade_scores[clade] = array 987 # minimum from root score 988 score_i = min(array) 989 # TODO: resolve internal states 990 score = score + score_i 991 return score
992 993
994 -class ParsimonyTreeConstructor(TreeConstructor):
995 """Parsimony tree constructor. 996 997 :Parameters: 998 searcher : TreeSearcher 999 tree searcher to search the best parsimony tree. 1000 starting_tree : Tree 1001 starting tree provided to the searcher. 1002 1003 Example 1004 -------- 1005 1006 >>> from Bio import AlignIO 1007 >>> from TreeConstruction import * 1008 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1009 >>> print aln 1010 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1011 AACGTGGCCACAT Alpha 1012 AAGGTCGCCACAC Beta 1013 GAGATTTCCGCCT Delta 1014 GAGATCTCCGCCC Epsilon 1015 CAGTTCGCCACAA Gamma 1016 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1017 >>> print tree 1018 Tree(weight=1.0, rooted=False) 1019 Clade(branch_length=0.0, name='Inner3') 1020 Clade(branch_length=0.01421, name='Inner2') 1021 Clade(branch_length=0.23927, name='Inner1') 1022 Clade(branch_length=0.08531, name='Epsilon') 1023 Clade(branch_length=0.13691, name='Delta') 1024 Clade(branch_length=0.29231, name='Alpha') 1025 Clade(branch_length=0.07477, name='Beta') 1026 Clade(branch_length=0.17523, name='Gamma') 1027 >>> from TreeConstruction import * 1028 >>> scorer = ParsimonyScorer() 1029 >>> searcher = NNITreeSearcher(scorer) 1030 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1031 >>> pars_tree = constructor.build_tree(aln) 1032 >>> print pars_tree 1033 Tree(weight=1.0, rooted=True) 1034 Clade(branch_length=0.0) 1035 Clade(branch_length=0.197335, name='Inner1') 1036 Clade(branch_length=0.13691, name='Delta') 1037 Clade(branch_length=0.08531, name='Epsilon') 1038 Clade(branch_length=0.041935, name='Inner2') 1039 Clade(branch_length=0.01421, name='Inner3') 1040 Clade(branch_length=0.17523, name='Gamma') 1041 Clade(branch_length=0.07477, name='Beta') 1042 Clade(branch_length=0.29231, name='Alpha') 1043 """ 1044
1045 - def __init__(self, searcher, starting_tree=None):
1046 self.searcher = searcher 1047 self.starting_tree = starting_tree
1048
1049 - def build_tree(self, alignment):
1050 """Build the tree. 1051 1052 :Parameters: 1053 alignment : MultipleSeqAlignment 1054 multiple sequence alignment to calculate parsimony tree. 1055 """ 1056 # if starting_tree is none, 1057 # create a upgma tree with 'identity' scoring matrix 1058 if self.starting_tree is None: 1059 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1060 "upgma") 1061 self.starting_tree = dtc.build_tree(alignment) 1062 return self.searcher.search(self.starting_tree, alignment)
1063