Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7   
   8  import itertools 
   9  import copy 
  10  from Bio.Phylo import BaseTree 
  11  from Bio.Align import MultipleSeqAlignment 
  12  from Bio.SubsMat import MatrixInfo 
  13  from Bio import _py3k 
  14   
  15   
16 -def _is_numeric(x):
17 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
18 19
20 -class _Matrix(object):
21 """Base class for distance matrix or scoring matrix 22 23 Accepts a list of names and a lower triangular matrix.:: 24 25 matrix = [[0], 26 [1, 0], 27 [2, 3, 0], 28 [4, 5, 6, 0]] 29 represents the symmetric matrix of 30 [0,1,2,4] 31 [1,0,3,5] 32 [2,3,0,6] 33 [4,5,6,0] 34 35 :Parameters: 36 names : list 37 names of elements, used for indexing 38 matrix : list 39 nested list of numerical lists in lower triangular format 40 41 Example 42 ------- 43 44 >>> from Bio.Phylo.TreeConstruction import _Matrix 45 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 46 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 47 >>> m = _Matrix(names, matrix) 48 >>> m 49 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 50 51 You can use two indices to get or assign an element in the matrix. 52 53 >>> m[1,2] 54 3 55 >>> m['Beta','Gamma'] 56 3 57 >>> m['Beta','Gamma'] = 4 58 >>> m['Beta','Gamma'] 59 4 60 61 Further more, you can use one index to get or assign a list of elements related to that index. 62 63 >>> m[0] 64 [0, 1, 2, 4] 65 >>> m['Alpha'] 66 [0, 1, 2, 4] 67 >>> m['Alpha'] = [0, 7, 8, 9] 68 >>> m[0] 69 [0, 7, 8, 9] 70 >>> m[0,1] 71 7 72 73 Also you can delete or insert a column&row of elemets by index. 74 75 >>> m 76 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 77 >>> del m['Alpha'] 78 >>> m 79 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 80 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 81 >>> m 82 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 83 84 """ 85
86 - def __init__(self, names, matrix=None):
87 """Initialize matrix by a list of names and a list of 88 lower triangular matrix data""" 89 # check names 90 if isinstance(names, list) and all(isinstance(s, str) for s in names): 91 if len(set(names)) == len(names): 92 self.names = names 93 else: 94 raise ValueError("Duplicate names found") 95 else: 96 raise TypeError("'names' should be a list of strings") 97 98 # check matrix 99 if matrix is None: 100 # create a new one with 0 if matrix is not assigned 101 matrix = [[0] * i for i in range(1, len(self) + 1)] 102 self.matrix = matrix 103 else: 104 # check if all elements are numbers 105 if (isinstance(matrix, list) and 106 all(isinstance(l, list) for l in matrix) and 107 all(_is_numeric(n) for n in [item for sublist in matrix 108 for item in sublist])): 109 # check if the same length with names 110 if len(matrix) == len(names): 111 # check if is lower triangle format 112 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 113 self.matrix = matrix 114 else: 115 raise ValueError( 116 "'matrix' should be in lower triangle format") 117 else: 118 raise ValueError( 119 "'names' and 'matrix' should be the same size") 120 else: 121 raise TypeError("'matrix' should be a list of numerical lists")
122
123 - def __getitem__(self, item):
124 """Access value(s) by the index(s) or name(s). 125 126 For a _Matrix object 'dm':: 127 128 dm[i] get a value list from the given 'i' to others; 129 dm[i, j] get the value between 'i' and 'j'; 130 dm['name'] map name to index first 131 dm['name1', 'name2'] map name to index first 132 """ 133 # Handle single indexing 134 if isinstance(item, (int, str)): 135 index = None 136 if isinstance(item, int): 137 index = item 138 elif isinstance(item, str): 139 if item in self.names: 140 index = self.names.index(item) 141 else: 142 raise ValueError("Item not found.") 143 else: 144 raise TypeError("Invalid index type.") 145 # check index 146 if index > len(self) - 1: 147 raise IndexError("Index out of range.") 148 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 149 # Handle double indexing 150 elif len(item) == 2: 151 row_index = None 152 col_index = None 153 if all(isinstance(i, int) for i in item): 154 row_index, col_index = item 155 elif all(isinstance(i, str) for i in item): 156 row_name, col_name = item 157 if row_name in self.names and col_name in self.names: 158 row_index = self.names.index(row_name) 159 col_index = self.names.index(col_name) 160 else: 161 raise ValueError("Item not found.") 162 else: 163 raise TypeError("Invalid index type.") 164 # check index 165 if row_index > len(self) - 1 or col_index > len(self) - 1: 166 raise IndexError("Index out of range.") 167 if row_index > col_index: 168 return self.matrix[row_index][col_index] 169 else: 170 return self.matrix[col_index][row_index] 171 else: 172 raise TypeError("Invalid index type.")
173
174 - def __setitem__(self, item, value):
175 """Set value by the index(s) or name(s). 176 177 Similar to __getitem__:: 178 179 dm[1] = [1, 0, 3, 4] set values from '1' to others; 180 dm[i, j] = 2 set the value from 'i' to 'j' 181 182 """ 183 # Handle single indexing 184 if isinstance(item, (int, str)): 185 index = None 186 if isinstance(item, int): 187 index = item 188 elif isinstance(item, str): 189 if item in self.names: 190 index = self.names.index(item) 191 else: 192 raise ValueError("Item not found.") 193 else: 194 raise TypeError("Invalid index type.") 195 # check index 196 if index > len(self) - 1: 197 raise IndexError("Index out of range.") 198 # check and assign value 199 if isinstance(value, list) and all(_is_numeric(n) for n in value): 200 if len(value) == len(self): 201 for i in range(0, index): 202 self.matrix[index][i] = value[i] 203 for i in range(index, len(self)): 204 self.matrix[i][index] = value[i] 205 else: 206 raise ValueError("Value not the same size.") 207 else: 208 raise TypeError("Invalid value type.") 209 # Handle double indexing 210 elif len(item) == 2: 211 row_index = None 212 col_index = None 213 if all(isinstance(i, int) for i in item): 214 row_index, col_index = item 215 elif all(isinstance(i, str) for i in item): 216 row_name, col_name = item 217 if row_name in self.names and col_name in self.names: 218 row_index = self.names.index(row_name) 219 col_index = self.names.index(col_name) 220 else: 221 raise ValueError("Item not found.") 222 else: 223 raise TypeError("Invalid index type.") 224 # check index 225 if row_index > len(self) - 1 or col_index > len(self) - 1: 226 raise IndexError("Index out of range.") 227 # check and assign value 228 if _is_numeric(value): 229 if row_index > col_index: 230 self.matrix[row_index][col_index] = value 231 else: 232 self.matrix[col_index][row_index] = value 233 else: 234 raise TypeError("Invalid value type.") 235 else: 236 raise TypeError("Invalid index type.")
237
238 - def __delitem__(self, item):
239 """Delete related distances by the index or name""" 240 index = None 241 if isinstance(item, int): 242 index = item 243 elif isinstance(item, str): 244 index = self.names.index(item) 245 else: 246 raise TypeError("Invalid index type.") 247 # remove distances related to index 248 for i in range(index + 1, len(self)): 249 del self.matrix[i][index] 250 del self.matrix[index] 251 # remove name 252 del self.names[index]
253
254 - def insert(self, name, value, index=None):
255 """Insert distances given the name and value. 256 257 :Parameters: 258 name : str 259 name of a row/col to be inserted 260 value : list 261 a row/col of values to be inserted 262 """ 263 if isinstance(name, str): 264 # insert at the given index or at the end 265 if index is None: 266 index = len(self) 267 if not isinstance(index, int): 268 raise TypeError("Invalid index type.") 269 # insert name 270 self.names.insert(index, name) 271 # insert elements of 0, to be assigned 272 self.matrix.insert(index, [0] * index) 273 for i in range(index, len(self)): 274 self.matrix[i].insert(index, 0) 275 # assign value 276 self[index] = value 277 else: 278 raise TypeError("Invalid name type.")
279
280 - def __len__(self):
281 """Matrix length""" 282 return len(self.names)
283
284 - def __repr__(self):
285 return self.__class__.__name__ \ 286 + "(names=%s, matrix=%s)" \ 287 % tuple(map(repr, (self.names, self.matrix)))
288
289 - def __str__(self):
290 """Get a lower triangular matrix string""" 291 matrix_string = '\n'.join( 292 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 293 for i in range(0, len(self))]) 294 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 295 return matrix_string
296 297
298 -class _DistanceMatrix(_Matrix):
299 """Distance matrix class that can be used for distance based tree algorithms. 300 301 All diagonal elements will be zero no matter what the users provide. 302 """ 303
304 - def __init__(self, names, matrix=None):
305 _Matrix.__init__(self, names, matrix) 306 self._set_zero_diagonal()
307
308 - def __setitem__(self, item, value):
309 _Matrix.__setitem__(self, item, value) 310 self._set_zero_diagonal()
311
312 - def _set_zero_diagonal(self):
313 """set all diagonal elements to zero""" 314 for i in range(0, len(self)): 315 self.matrix[i][i] = 0
316 317
318 -class DistanceCalculator(object):
319 """Class to calculate the distance matrix from a DNA or Protein 320 321 Multiple Sequence Alignment(MSA) and the given name of the 322 substitution model. 323 324 Currently only scoring matrices are used. 325 326 :Parameters: 327 model : str 328 Name of the model matrix to be used to calculate distance. 329 The attribute `dna_matrices` contains the available model 330 names for DNA sequences and `protein_matrices` for protein 331 sequences. 332 333 Example 334 ------- 335 336 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 337 >>> from Bio import AlignIO 338 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 339 >>> print aln 340 SingleLetterAlphabet() alignment with 5 rows and 13 columns 341 AACGTGGCCACAT Alpha 342 AAGGTCGCCACAC Beta 343 GAGATTTCCGCCT Delta 344 GAGATCTCCGCCC Epsilon 345 CAGTTCGCCACAA Gamma 346 347 DNA calculator with 'identity' model:: 348 349 >>> calculator = DistanceCalculator('identity') 350 >>> dm = calculator.get_distance(aln) 351 >>> print dm 352 Alpha 0 353 Beta 0.230769230769 0 354 Gamma 0.384615384615 0.230769230769 0 355 Delta 0.538461538462 0.538461538462 0.538461538462 0 356 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 357 Alpha Beta Gamma Delta Epsilon 358 359 Protein calculator with 'blosum62' model:: 360 361 >>> calculator = DistanceCalculator('blosum62') 362 >>> dm = calculator.get_distance(aln) 363 >>> print dm 364 Alpha 0 365 Beta 0.369047619048 0 366 Gamma 0.493975903614 0.25 0 367 Delta 0.585365853659 0.547619047619 0.566265060241 0 368 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 369 Alpha Beta Gamma Delta Epsilon 370 """ 371 372 dna_alphabet = ['A', 'T', 'C', 'G'] 373 374 # BLAST nucleic acid scoring matrix 375 blastn = [[5], 376 [-4, 5], 377 [-4, -4, 5], 378 [-4, -4, -4, 5]] 379 380 # transition/transversion scoring matrix 381 trans = [[6], 382 [-5, 6], 383 [-5, -1, 6], 384 [-1, -5, -5, 6]] 385 386 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 387 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 388 'Z'] 389 390 # matrices available 391 dna_matrices = {'blastn': blastn, 'trans': trans} 392 protein_models = MatrixInfo.available_matrices 393 protein_matrices = dict((name, getattr(MatrixInfo, name)) 394 for name in protein_models) 395 396 dna_models = list(dna_matrices.keys()) 397 398 models = ['identity'] + dna_models + protein_models 399
400 - def __init__(self, model='identity'):
401 """Initialize with a distance model.""" 402 if model == 'identity': 403 self.scoring_matrix = None 404 elif model in self.dna_models: 405 self.scoring_matrix = _Matrix(self.dna_alphabet, 406 self.dna_matrices[model]) 407 elif model in self.protein_models: 408 self.scoring_matrix = self._build_protein_matrix( 409 self.protein_matrices[model]) 410 else: 411 raise ValueError("Model not supported. Available models: " + 412 ", ".join(self.models))
413
414 - def _pairwise(self, seq1, seq2):
415 """Calculate pairwise distance from two sequences. 416 417 Returns a value between 0 (identical sequences) and 1 (completely 418 different, or seq1 is an empty string.) 419 """ 420 score = 0 421 max_score = 0 422 if self.scoring_matrix: 423 max_score1 = 0 424 max_score2 = 0 425 skip_letters = ['-', '*'] 426 for i in range(0, len(seq1)): 427 l1 = seq1[i] 428 l2 = seq2[i] 429 if l1 in skip_letters or l2 in skip_letters: 430 continue 431 if l1 not in self.scoring_matrix.names: 432 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 433 % (l1, seq1.id, i)) 434 if l2 not in self.scoring_matrix.names: 435 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 436 % (l2, seq2.id, i)) 437 max_score1 += self.scoring_matrix[l1, l1] 438 max_score2 += self.scoring_matrix[l2, l2] 439 score += self.scoring_matrix[l1, l2] 440 # Take the higher score if the matrix is asymmetrical 441 max_score = max(max_score1, max_score2) 442 else: 443 # Score by character identity, not skipping any special letters 444 for i in range(0, len(seq1)): 445 l1 = seq1[i] 446 l2 = seq2[i] 447 if l1 == l2: 448 score += 1 449 max_score = len(seq1) 450 451 if max_score == 0: 452 return 1 # max possible scaled distance 453 return 1 - (score * 1.0 / max_score)
454
455 - def get_distance(self, msa):
456 """Return a _DistanceMatrix for MSA object 457 458 :Parameters: 459 msa : MultipleSeqAlignment 460 DNA or Protein multiple sequence alignment. 461 462 """ 463 if not isinstance(msa, MultipleSeqAlignment): 464 raise TypeError("Must provide a MultipleSeqAlignment object.") 465 466 names = [s.id for s in msa] 467 dm = _DistanceMatrix(names) 468 for seq1, seq2 in itertools.combinations(msa, 2): 469 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 470 return dm
471
472 - def _build_protein_matrix(self, subsmat):
473 """Convert matrix from SubsMat format to _Matrix object""" 474 protein_matrix = _Matrix(self.protein_alphabet) 475 for k, v in subsmat.items(): 476 aa1, aa2 = k 477 protein_matrix[aa1, aa2] = v 478 return protein_matrix
479 480
481 -class TreeConstructor(object):
482 """Base class for all tree constructor.""" 483
484 - def build_tree(self, msa):
485 """Caller to built the tree from a MultipleSeqAlignment object. 486 487 This should be implemented in subclass""" 488 raise NotImplementedError("Method not implemented!")
489 490
491 -class DistanceTreeConstructor(TreeConstructor):
492 """Distance based tree constructor. 493 494 :Parameters: 495 method : str 496 Distance tree construction method, 'nj'(default) or 'upgma'. 497 distance_calculator : DistanceCalculator 498 The distance matrix calculator for multiple sequence alignment. 499 It must be provided if `build_tree` will be called. 500 501 Example 502 -------- 503 504 >>> from TreeConstruction import DistanceTreeConstructor 505 >>> constructor = DistanceTreeConstructor() 506 507 UPGMA Tree: 508 509 >>> upgmatree = constructor.upgma(dm) 510 >>> print upgmatree 511 Tree(rooted=True) 512 Clade(name='Inner4') 513 Clade(branch_length=0.171955155115, name='Inner1') 514 Clade(branch_length=0.111111111111, name='Epsilon') 515 Clade(branch_length=0.111111111111, name='Delta') 516 Clade(branch_length=0.0673103855608, name='Inner3') 517 Clade(branch_length=0.0907558806655, name='Inner2') 518 Clade(branch_length=0.125, name='Gamma') 519 Clade(branch_length=0.125, name='Beta') 520 Clade(branch_length=0.215755880666, name='Alpha') 521 522 NJ Tree: 523 524 >>> njtree = constructor.nj(dm) 525 >>> print njtree 526 Tree(rooted=False) 527 Clade(name='Inner3') 528 Clade(branch_length=0.0142054862889, name='Inner2') 529 Clade(branch_length=0.239265540676, name='Inner1') 530 Clade(branch_length=0.0853101915988, name='Epsilon') 531 Clade(branch_length=0.136912030623, name='Delta') 532 Clade(branch_length=0.292306275042, name='Alpha') 533 Clade(branch_length=0.0747705106139, name='Beta') 534 Clade(branch_length=0.175229489386, name='Gamma') 535 536 537 """ 538 539 methods = ['nj', 'upgma'] 540
541 - def __init__(self, distance_calculator=None, method="nj"):
542 if (distance_calculator is None or 543 isinstance(distance_calculator, DistanceCalculator)): 544 self.distance_calculator = distance_calculator 545 else: 546 raise TypeError("Must provide a DistanceCalculator object.") 547 if isinstance(method, str) and method in self.methods: 548 self.method = method 549 else: 550 raise TypeError("Bad method: " + method + 551 ". Available methods: " + ", ".join(self.methods))
552
553 - def build_tree(self, msa):
554 if self.distance_calculator: 555 dm = self.distance_calculator.get_distance(msa) 556 tree = None 557 if self.method == 'upgma': 558 tree = self.upgma(dm) 559 else: 560 tree = self.nj(dm) 561 return tree 562 else: 563 raise TypeError("Must provide a DistanceCalculator object.")
564
565 - def upgma(self, distance_matrix):
566 """Construct and return an UPGMA tree. 567 568 Constructs and returns an Unweighted Pair Group Method 569 with Arithmetic mean (UPGMA) tree. 570 571 :Parameters: 572 distance_matrix : _DistanceMatrix 573 The distance matrix for tree construction. 574 """ 575 if not isinstance(distance_matrix, _DistanceMatrix): 576 raise TypeError("Must provide a _DistanceMatrix object.") 577 578 # make a copy of the distance matrix to be used 579 dm = copy.deepcopy(distance_matrix) 580 # init terminal clades 581 clades = [BaseTree.Clade(None, name) for name in dm.names] 582 # init minimum index 583 min_i = 0 584 min_j = 0 585 inner_count = 0 586 while len(dm) > 1: 587 min_dist = dm[1, 0] 588 # find minimum index 589 for i in range(1, len(dm)): 590 for j in range(0, i): 591 if min_dist >= dm[i, j]: 592 min_dist = dm[i, j] 593 min_i = i 594 min_j = j 595 596 # create clade 597 clade1 = clades[min_i] 598 clade2 = clades[min_j] 599 inner_count += 1 600 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 601 inner_clade.clades.append(clade1) 602 inner_clade.clades.append(clade2) 603 # assign branch length 604 if clade1.is_terminal(): 605 clade1.branch_length = min_dist * 1.0 / 2 606 else: 607 clade1.branch_length = min_dist * \ 608 1.0 / 2 - self._height_of(clade1) 609 610 if clade2.is_terminal(): 611 clade2.branch_length = min_dist * 1.0 / 2 612 else: 613 clade2.branch_length = min_dist * \ 614 1.0 / 2 - self._height_of(clade2) 615 616 # update node list 617 clades[min_j] = inner_clade 618 del clades[min_i] 619 620 # rebuild distance matrix, 621 # set the distances of new node at the index of min_j 622 for k in range(0, len(dm)): 623 if k != min_i and k != min_j: 624 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 625 626 dm.names[min_j] = "Inner" + str(inner_count) 627 628 del dm[min_i] 629 inner_clade.branch_length = 0 630 return BaseTree.Tree(inner_clade)
631
632 - def nj(self, distance_matrix):
633 """Construct and return an Neighbor Joining tree. 634 635 :Parameters: 636 distance_matrix : _DistanceMatrix 637 The distance matrix for tree construction. 638 """ 639 if not isinstance(distance_matrix, _DistanceMatrix): 640 raise TypeError("Must provide a _DistanceMatrix object.") 641 642 # make a copy of the distance matrix to be used 643 dm = copy.deepcopy(distance_matrix) 644 # init terminal clades 645 clades = [BaseTree.Clade(None, name) for name in dm.names] 646 # init node distance 647 node_dist = [0] * len(dm) 648 # init minimum index 649 min_i = 0 650 min_j = 0 651 inner_count = 0 652 while len(dm) > 2: 653 # calculate nodeDist 654 for i in range(0, len(dm)): 655 node_dist[i] = 0 656 for j in range(0, len(dm)): 657 node_dist[i] += dm[i, j] 658 node_dist[i] = node_dist[i] / (len(dm) - 2) 659 660 # find minimum distance pair 661 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 662 min_i = 0 663 min_j = 1 664 for i in range(1, len(dm)): 665 for j in range(0, i): 666 temp = dm[i, j] - node_dist[i] - node_dist[j] 667 if min_dist > temp: 668 min_dist = temp 669 min_i = i 670 min_j = j 671 # create clade 672 clade1 = clades[min_i] 673 clade2 = clades[min_j] 674 inner_count += 1 675 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 676 inner_clade.clades.append(clade1) 677 inner_clade.clades.append(clade2) 678 # assign branch length 679 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - 680 node_dist[min_j]) / 2.0 681 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 682 683 # update node list 684 clades[min_j] = inner_clade 685 del clades[min_i] 686 687 # rebuild distance matrix, 688 # set the distances of new node at the index of min_j 689 for k in range(0, len(dm)): 690 if k != min_i and k != min_j: 691 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - 692 dm[min_i, min_j]) / 2.0 693 694 dm.names[min_j] = "Inner" + str(inner_count) 695 del dm[min_i] 696 697 # set the last clade as one of the child of the inner_clade 698 root = None 699 if clades[0] == inner_clade: 700 clades[0].branch_length = 0 701 clades[1].branch_length = dm[1, 0] 702 clades[0].clades.append(clades[1]) 703 root = clades[0] 704 else: 705 clades[0].branch_length = dm[1, 0] 706 clades[1].branch_length = 0 707 clades[1].clades.append(clades[0]) 708 root = clades[1] 709 710 return BaseTree.Tree(root, rooted=False)
711
712 - def _height_of(self, clade):
713 """calculate clade height -- the longest path to any terminal.""" 714 height = 0 715 if clade.is_terminal(): 716 height = clade.branch_length 717 else: 718 height = height + max([self._height_of(c) for c in clade.clades]) 719 return height
720 721 # #################### Tree Scoring and Searching Classes ##################### 722 723
724 -class Scorer(object):
725 """Base class for all tree scoring methods""" 726
727 - def get_score(self, tree, alignment):
728 """Caller to get the score of a tree for the given alignment. 729 730 This should be implemented in subclass. 731 """ 732 raise NotImplementedError("Method not implemented!")
733 734
735 -class TreeSearcher(object):
736 """Base class for all tree searching methods""" 737
738 - def search(self, starting_tree, alignment):
739 """Caller to search the best tree with a starting tree. 740 741 This should be implemented in subclass. 742 """ 743 raise NotImplementedError("Method not implemented!")
744 745
746 -class NNITreeSearcher(TreeSearcher):
747 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 748 749 :Parameters: 750 scorer : ParsimonyScorer 751 parsimony scorer to calculate the parsimony score of 752 different trees during NNI algorithm. 753 """ 754
755 - def __init__(self, scorer):
756 if isinstance(scorer, Scorer): 757 self.scorer = scorer 758 else: 759 raise TypeError("Must provide a Scorer object.")
760
761 - def search(self, starting_tree, alignment):
762 """Implement the TreeSearcher.search method. 763 764 :Parameters: 765 starting_tree : Tree 766 starting tree of NNI method. 767 alignment : MultipleSeqAlignment 768 multiple sequence alignment used to calculate parsimony 769 score of different NNI trees. 770 """ 771 return self._nni(starting_tree, alignment)
772
773 - def _nni(self, starting_tree, alignment):
774 """Search for the best parsimony tree using the NNI algorithm.""" 775 best_tree = starting_tree 776 while True: 777 best_score = self.scorer.get_score(best_tree, alignment) 778 temp = best_score 779 for t in self._get_neighbors(best_tree): 780 score = self.scorer.get_score(t, alignment) 781 if score < best_score: 782 best_score = score 783 best_tree = t 784 # stop if no smaller score exist 785 if best_score >= temp: 786 break 787 return best_tree
788
789 - def _get_neighbors(self, tree):
790 """Get all neighbor trees of the given tree. 791 792 Currently only for binary rooted trees. 793 """ 794 # make child to parent dict 795 parents = {} 796 for clade in tree.find_clades(): 797 if clade != tree.root: 798 node_path = tree.get_path(clade) 799 # cannot get the parent if the parent is root. Bug? 800 if len(node_path) == 1: 801 parents[clade] = tree.root 802 else: 803 parents[clade] = node_path[-2] 804 neighbors = [] 805 root_childs = [] 806 for clade in tree.get_nonterminals(order="level"): 807 if clade == tree.root: 808 left = clade.clades[0] 809 right = clade.clades[1] 810 root_childs.append(left) 811 root_childs.append(right) 812 if not left.is_terminal() and not right.is_terminal(): 813 # make changes around the left_left clade 814 # left_left = left.clades[0] 815 left_right = left.clades[1] 816 right_left = right.clades[0] 817 right_right = right.clades[1] 818 # neightbor 1 (left_left + right_right) 819 del left.clades[1] 820 del right.clades[1] 821 left.clades.append(right_right) 822 right.clades.append(left_right) 823 temp_tree = copy.deepcopy(tree) 824 neighbors.append(temp_tree) 825 # neighbor 2 (left_left + right_left) 826 del left.clades[1] 827 del right.clades[0] 828 left.clades.append(right_left) 829 right.clades.append(right_right) 830 temp_tree = copy.deepcopy(tree) 831 neighbors.append(temp_tree) 832 # change back (left_left + left_right) 833 del left.clades[1] 834 del right.clades[0] 835 left.clades.append(left_right) 836 right.clades.insert(0, right_left) 837 elif clade in root_childs: 838 # skip root child 839 continue 840 else: 841 # method for other clades 842 # make changes around the parent clade 843 left = clade.clades[0] 844 right = clade.clades[1] 845 parent = parents[clade] 846 if clade == parent.clades[0]: 847 sister = parent.clades[1] 848 # neighbor 1 (parent + right) 849 del parent.clades[1] 850 del clade.clades[1] 851 parent.clades.append(right) 852 clade.clades.append(sister) 853 temp_tree = copy.deepcopy(tree) 854 neighbors.append(temp_tree) 855 # neighbor 2 (parent + left) 856 del parent.clades[1] 857 del clade.clades[0] 858 parent.clades.append(left) 859 clade.clades.append(right) 860 temp_tree = copy.deepcopy(tree) 861 neighbors.append(temp_tree) 862 # change back (parent + sister) 863 del parent.clades[1] 864 del clade.clades[0] 865 parent.clades.append(sister) 866 clade.clades.insert(0, left) 867 else: 868 sister = parent.clades[0] 869 # neighbor 1 (parent + right) 870 del parent.clades[0] 871 del clade.clades[1] 872 parent.clades.insert(0, right) 873 clade.clades.append(sister) 874 temp_tree = copy.deepcopy(tree) 875 neighbors.append(temp_tree) 876 # neighbor 2 (parent + left) 877 del parent.clades[0] 878 del clade.clades[0] 879 parent.clades.insert(0, left) 880 clade.clades.append(right) 881 temp_tree = copy.deepcopy(tree) 882 neighbors.append(temp_tree) 883 # change back (parent + sister) 884 del parent.clades[0] 885 del clade.clades[0] 886 parent.clades.insert(0, sister) 887 clade.clades.insert(0, left) 888 return neighbors
889 890 # ######################## Parsimony Classes ########################## 891 892
893 -class ParsimonyScorer(Scorer):
894 """Parsimony scorer with a scoring matrix. 895 896 This is a combination of Fitch algorithm and Sankoff algorithm. 897 See ParsimonyTreeConstructor for usage. 898 899 :Parameters: 900 matrix : _Matrix 901 scoring matrix used in parsimony score calculation. 902 """ 903
904 - def __init__(self, matrix=None):
905 if not matrix or isinstance(matrix, _Matrix): 906 self.matrix = matrix 907 else: 908 raise TypeError("Must provide a _Matrix object.")
909
910 - def get_score(self, tree, alignment):
911 """Calculate and return the parsimony score given a tree and 912 the MSA using the Fitch algorithm without the penalty matrix 913 the Sankoff algorithm with the matrix""" 914 # make sure the tree is rooted and bifurcating 915 if not tree.is_bifurcating(): 916 raise ValueError("The tree provided should be bifurcating.") 917 if not tree.rooted: 918 tree.root_at_midpoint() 919 # sort tree terminals and alignment 920 terms = tree.get_terminals() 921 terms.sort(key=lambda term: term.name) 922 alignment.sort() 923 if not all(t.name == a.id for t, a in zip(terms, alignment)): 924 raise ValueError( 925 "Taxon names of the input tree should be the same with the alignment.") 926 # term_align = dict(zip(terms, alignment)) 927 score = 0 928 for i in range(len(alignment[0])): 929 # parsimony score for column_i 930 score_i = 0 931 # get column 932 column_i = alignment[:, i] 933 # skip non-informative column 934 if column_i == len(column_i) * column_i[0]: 935 continue 936 937 # start calculating score_i using the tree and column_i 938 939 # Fitch algorithm without the penalty matrix 940 if not self.matrix: 941 # init by mapping terminal clades and states in column_i 942 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 943 for clade in tree.get_nonterminals(order="postorder"): 944 clade_childs = clade.clades 945 left_state = clade_states[clade_childs[0]] 946 right_state = clade_states[clade_childs[1]] 947 state = left_state & right_state 948 if not state: 949 state = left_state | right_state 950 score_i = score_i + 1 951 clade_states[clade] = state 952 # Sankoff algorithm with the penalty matrix 953 else: 954 inf = float('inf') 955 # init score arrays for terminal clades 956 alphabet = self.matrix.names 957 length = len(alphabet) 958 clade_scores = {} 959 for j in range(len(column_i)): 960 array = [inf] * length 961 index = alphabet.index(column_i[j]) 962 array[index] = 0 963 clade_scores[terms[j]] = array 964 # bottom up calculation 965 for clade in tree.get_nonterminals(order="postorder"): 966 clade_childs = clade.clades 967 left_score = clade_scores[clade_childs[0]] 968 right_score = clade_scores[clade_childs[1]] 969 array = [] 970 for m in range(length): 971 min_l = inf 972 min_r = inf 973 for n in range(length): 974 sl = self.matrix[ 975 alphabet[m], alphabet[n]] + left_score[n] 976 sr = self.matrix[ 977 alphabet[m], alphabet[n]] + right_score[n] 978 if min_l > sl: 979 min_l = sl 980 if min_r > sr: 981 min_r = sr 982 array.append(min_l + min_r) 983 clade_scores[clade] = array 984 # minimum from root score 985 score_i = min(array) 986 # TODO: resolve internal states 987 score = score + score_i 988 return score
989 990
991 -class ParsimonyTreeConstructor(TreeConstructor):
992 """Parsimony tree constructor. 993 994 :Parameters: 995 searcher : TreeSearcher 996 tree searcher to search the best parsimony tree. 997 starting_tree : Tree 998 starting tree provided to the searcher. 999 1000 Example 1001 -------- 1002 1003 >>> from Bio import AlignIO 1004 >>> from TreeConstruction import * 1005 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1006 >>> print aln 1007 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1008 AACGTGGCCACAT Alpha 1009 AAGGTCGCCACAC Beta 1010 GAGATTTCCGCCT Delta 1011 GAGATCTCCGCCC Epsilon 1012 CAGTTCGCCACAA Gamma 1013 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1014 >>> print tree 1015 Tree(weight=1.0, rooted=False) 1016 Clade(branch_length=0.0, name='Inner3') 1017 Clade(branch_length=0.01421, name='Inner2') 1018 Clade(branch_length=0.23927, name='Inner1') 1019 Clade(branch_length=0.08531, name='Epsilon') 1020 Clade(branch_length=0.13691, name='Delta') 1021 Clade(branch_length=0.29231, name='Alpha') 1022 Clade(branch_length=0.07477, name='Beta') 1023 Clade(branch_length=0.17523, name='Gamma') 1024 >>> from TreeConstruction import * 1025 >>> scorer = ParsimonyScorer() 1026 >>> searcher = NNITreeSearcher(scorer) 1027 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1028 >>> pars_tree = constructor.build_tree(aln) 1029 >>> print pars_tree 1030 Tree(weight=1.0, rooted=True) 1031 Clade(branch_length=0.0) 1032 Clade(branch_length=0.197335, name='Inner1') 1033 Clade(branch_length=0.13691, name='Delta') 1034 Clade(branch_length=0.08531, name='Epsilon') 1035 Clade(branch_length=0.041935, name='Inner2') 1036 Clade(branch_length=0.01421, name='Inner3') 1037 Clade(branch_length=0.17523, name='Gamma') 1038 Clade(branch_length=0.07477, name='Beta') 1039 Clade(branch_length=0.29231, name='Alpha') 1040 """ 1041
1042 - def __init__(self, searcher, starting_tree=None):
1043 self.searcher = searcher 1044 self.starting_tree = starting_tree
1045
1046 - def build_tree(self, alignment):
1047 """Build the tree. 1048 1049 :Parameters: 1050 alignment : MultipleSeqAlignment 1051 multiple sequence alignment to calculate parsimony tree. 1052 """ 1053 # if starting_tree is none, 1054 # create a upgma tree with 'identity' scoring matrix 1055 if self.starting_tree is None: 1056 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1057 "upgma") 1058 self.starting_tree = dtc.build_tree(alignment) 1059 return self.searcher.search(self.starting_tree, alignment)
1060