Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7  __docformat__ = "restructuredtext en" 
   8   
   9  import itertools 
  10  import copy 
  11  from Bio.Phylo import BaseTree 
  12  from Bio.Align import MultipleSeqAlignment 
  13  from Bio.SubsMat import MatrixInfo 
  14  from Bio import _py3k 
  15   
  16   
17 -def _is_numeric(x):
18 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19 20
21 -class _Matrix(object):
22 """Base class for distance matrix or scoring matrix 23 24 Accepts a list of names and a lower triangular matrix.:: 25 26 matrix = [[0], 27 [1, 0], 28 [2, 3, 0], 29 [4, 5, 6, 0]] 30 represents the symmetric matrix of 31 [0,1,2,4] 32 [1,0,3,5] 33 [2,3,0,6] 34 [4,5,6,0] 35 36 :Parameters: 37 names : list 38 names of elements, used for indexing 39 matrix : list 40 nested list of numerical lists in lower triangular format 41 42 Example 43 ------- 44 45 >>> from Bio.Phylo.TreeConstruction import _Matrix 46 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 47 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 48 >>> m = _Matrix(names, matrix) 49 >>> m 50 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 51 52 You can use two indices to get or assign an element in the matrix. 53 54 >>> m[1,2] 55 3 56 >>> m['Beta','Gamma'] 57 3 58 >>> m['Beta','Gamma'] = 4 59 >>> m['Beta','Gamma'] 60 4 61 62 Further more, you can use one index to get or assign a list of elements related to that index. 63 64 >>> m[0] 65 [0, 1, 2, 4] 66 >>> m['Alpha'] 67 [0, 1, 2, 4] 68 >>> m['Alpha'] = [0, 7, 8, 9] 69 >>> m[0] 70 [0, 7, 8, 9] 71 >>> m[0,1] 72 7 73 74 Also you can delete or insert a column&row of elemets by index. 75 76 >>> m 77 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 78 >>> del m['Alpha'] 79 >>> m 80 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 81 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 82 >>> m 83 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 84 85 """ 86
87 - def __init__(self, names, matrix=None):
88 """Initialize matrix by a list of names and a list of 89 lower triangular matrix data""" 90 # check names 91 if isinstance(names, list) and all(isinstance(s, str) for s in names): 92 if len(set(names)) == len(names): 93 self.names = names 94 else: 95 raise ValueError("Duplicate names found") 96 else: 97 raise TypeError("'names' should be a list of strings") 98 99 # check matrix 100 if matrix is None: 101 # create a new one with 0 if matrix is not assigned 102 matrix = [[0] * i for i in range(1, len(self) + 1)] 103 self.matrix = matrix 104 else: 105 # check if all elements are numbers 106 if (isinstance(matrix, list) 107 and all(isinstance(l, list) for l in matrix) 108 and all(_is_numeric(n) for n in [item for sublist in matrix 109 for item in sublist])): 110 # check if the same length with names 111 if len(matrix) == len(names): 112 # check if is lower triangle format 113 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 114 self.matrix = matrix 115 else: 116 raise ValueError( 117 "'matrix' should be in lower triangle format") 118 else: 119 raise ValueError( 120 "'names' and 'matrix' should be the same size") 121 else: 122 raise TypeError("'matrix' should be a list of numerical lists")
123
124 - def __getitem__(self, item):
125 """Access value(s) by the index(s) or name(s). 126 127 For a _Matrix object 'dm':: 128 129 dm[i] get a value list from the given 'i' to others; 130 dm[i, j] get the value between 'i' and 'j'; 131 dm['name'] map name to index first 132 dm['name1', 'name2'] map name to index first 133 """ 134 # Handle single indexing 135 if isinstance(item, (int, str)): 136 index = None 137 if isinstance(item, int): 138 index = item 139 elif isinstance(item, str): 140 if item in self.names: 141 index = self.names.index(item) 142 else: 143 raise ValueError("Item not found.") 144 else: 145 raise TypeError("Invalid index type.") 146 # check index 147 if index > len(self) - 1: 148 raise IndexError("Index out of range.") 149 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 150 # Handle double indexing 151 elif len(item) == 2: 152 row_index = None 153 col_index = None 154 if all(isinstance(i, int) for i in item): 155 row_index, col_index = item 156 elif all(isinstance(i, str) for i in item): 157 row_name, col_name = item 158 if row_name in self.names and col_name in self.names: 159 row_index = self.names.index(row_name) 160 col_index = self.names.index(col_name) 161 else: 162 raise ValueError("Item not found.") 163 else: 164 raise TypeError("Invalid index type.") 165 # check index 166 if row_index > len(self) - 1 or col_index > len(self) - 1: 167 raise IndexError("Index out of range.") 168 if row_index > col_index: 169 return self.matrix[row_index][col_index] 170 else: 171 return self.matrix[col_index][row_index] 172 else: 173 raise TypeError("Invalid index type.")
174
175 - def __setitem__(self, item, value):
176 """Set value by the index(s) or name(s). 177 178 Similar to __getitem__:: 179 180 dm[1] = [1, 0, 3, 4] set values from '1' to others; 181 dm[i, j] = 2 set the value from 'i' to 'j' 182 183 """ 184 185 # Handle single indexing 186 if isinstance(item, (int, str)): 187 index = None 188 if isinstance(item, int): 189 index = item 190 elif isinstance(item, str): 191 if item in self.names: 192 index = self.names.index(item) 193 else: 194 raise ValueError("Item not found.") 195 else: 196 raise TypeError("Invalid index type.") 197 # check index 198 if index > len(self) - 1: 199 raise IndexError("Index out of range.") 200 # check and assign value 201 if isinstance(value, list) and all(_is_numeric(n) for n in value): 202 if len(value) == len(self): 203 for i in range(0, index): 204 self.matrix[index][i] = value[i] 205 for i in range(index, len(self)): 206 self.matrix[i][index] = value[i] 207 else: 208 raise ValueError("Value not the same size.") 209 else: 210 raise TypeError("Invalid value type.") 211 # Handle double indexing 212 elif len(item) == 2: 213 row_index = None 214 col_index = None 215 if all(isinstance(i, int) for i in item): 216 row_index, col_index = item 217 elif all(isinstance(i, str) for i in item): 218 row_name, col_name = item 219 if row_name in self.names and col_name in self.names: 220 row_index = self.names.index(row_name) 221 col_index = self.names.index(col_name) 222 else: 223 raise ValueError("Item not found.") 224 else: 225 raise TypeError("Invalid index type.") 226 # check index 227 if row_index > len(self) - 1 or col_index > len(self) - 1: 228 raise IndexError("Index out of range.") 229 # check and assign value 230 if _is_numeric(value): 231 if row_index > col_index: 232 self.matrix[row_index][col_index] = value 233 else: 234 self.matrix[col_index][row_index] = value 235 else: 236 raise TypeError("Invalid value type.") 237 else: 238 raise TypeError("Invalid index type.")
239
240 - def __delitem__(self, item):
241 """Delete related distances by the index or name""" 242 index = None 243 if isinstance(item, int): 244 index = item 245 elif isinstance(item, str): 246 index = self.names.index(item) 247 else: 248 raise TypeError("Invalid index type.") 249 # remove distances related to index 250 for i in range(index + 1, len(self)): 251 del self.matrix[i][index] 252 del self.matrix[index] 253 # remove name 254 del self.names[index]
255
256 - def insert(self, name, value, index=None):
257 """Insert distances given the name and value. 258 259 :Parameters: 260 name : str 261 name of a row/col to be inserted 262 value : list 263 a row/col of values to be inserted 264 """ 265 if isinstance(name, str): 266 # insert at the given index or at the end 267 if index is None: 268 index = len(self) 269 if not isinstance(index, int): 270 raise TypeError("Invalid index type.") 271 # insert name 272 self.names.insert(index, name) 273 # insert elements of 0, to be assigned 274 self.matrix.insert(index, [0] * index) 275 for i in range(index, len(self)): 276 self.matrix[i].insert(index, 0) 277 # assign value 278 self[index] = value 279 else: 280 raise TypeError("Invalid name type.")
281
282 - def __len__(self):
283 """Matrix length""" 284 return len(self.names)
285
286 - def __repr__(self):
287 return self.__class__.__name__ \ 288 + "(names=%s, matrix=%s)" \ 289 % tuple(map(repr, (self.names, self.matrix)))
290
291 - def __str__(self):
292 """Get a lower triangular matrix string""" 293 matrix_string = '\n'.join( 294 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 295 for i in range(0, len(self))]) 296 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 297 return matrix_string
298 299
300 -class _DistanceMatrix(_Matrix):
301 """Distance matrix class that can be used for distance based tree algorithms. 302 303 All diagonal elements will be zero no matter what the users provide. 304 """ 305
306 - def __init__(self, names, matrix=None):
307 _Matrix.__init__(self, names, matrix) 308 self._set_zero_diagonal()
309
310 - def __setitem__(self, item, value):
311 _Matrix.__setitem__(self, item, value) 312 self._set_zero_diagonal()
313
314 - def _set_zero_diagonal(self):
315 """set all diagonal elements to zero""" 316 for i in range(0, len(self)): 317 self.matrix[i][i] = 0
318 319
320 -class DistanceCalculator(object):
321 """Class to calculate the distance matrix from a DNA or Protein 322 323 Multiple Sequence Alignment(MSA) and the given name of the 324 substitution model. 325 326 Currently only scoring matrices are used. 327 328 :Parameters: 329 model : str 330 Name of the model matrix to be used to calculate distance. 331 The attribute `dna_matrices` contains the available model 332 names for DNA sequences and `protein_matrices` for protein 333 sequences. 334 335 Example 336 ------- 337 338 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 339 >>> from Bio import AlignIO 340 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 341 >>> print aln 342 SingleLetterAlphabet() alignment with 5 rows and 13 columns 343 AACGTGGCCACAT Alpha 344 AAGGTCGCCACAC Beta 345 GAGATTTCCGCCT Delta 346 GAGATCTCCGCCC Epsilon 347 CAGTTCGCCACAA Gamma 348 349 DNA calculator with 'identity' model:: 350 351 >>> calculator = DistanceCalculator('identity') 352 >>> dm = calculator.get_distance(aln) 353 >>> print dm 354 Alpha 0 355 Beta 0.230769230769 0 356 Gamma 0.384615384615 0.230769230769 0 357 Delta 0.538461538462 0.538461538462 0.538461538462 0 358 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 359 Alpha Beta Gamma Delta Epsilon 360 361 Protein calculator with 'blosum62' model:: 362 363 >>> calculator = DistanceCalculator('blosum62') 364 >>> dm = calculator.get_distance(aln) 365 >>> print dm 366 Alpha 0 367 Beta 0.369047619048 0 368 Gamma 0.493975903614 0.25 0 369 Delta 0.585365853659 0.547619047619 0.566265060241 0 370 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 371 Alpha Beta Gamma Delta Epsilon 372 """ 373 374 dna_alphabet = ['A', 'T', 'C', 'G'] 375 376 # BLAST nucleic acid scoring matrix 377 blastn = [[5], 378 [-4, 5], 379 [-4, -4, 5], 380 [-4, -4, -4, 5]] 381 382 # transition/transversion scoring matrix 383 trans = [[6], 384 [-5, 6], 385 [-5, -1, 6], 386 [-1, -5, -5, 6]] 387 388 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 389 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 390 'Z'] 391 392 # matrices available 393 dna_matrices = {'blastn': blastn, 'trans': trans} 394 protein_models = MatrixInfo.available_matrices 395 protein_matrices = dict((name, getattr(MatrixInfo, name)) 396 for name in protein_models) 397 398 dna_models = list(dna_matrices.keys()) 399 400 models = ['identity'] + dna_models + protein_models 401
402 - def __init__(self, model='identity'):
403 """Initialize with a distance model""" 404 405 if model == 'identity': 406 self.scoring_matrix = None 407 elif model in self.dna_models: 408 self.scoring_matrix = _Matrix(self.dna_alphabet, 409 self.dna_matrices[model]) 410 elif model in self.protein_models: 411 self.scoring_matrix = self._build_protein_matrix( 412 self.protein_matrices[model]) 413 else: 414 raise ValueError("Model not supported. Available models: " 415 + ", ".join(self.models))
416
417 - def _pairwise(self, seq1, seq2):
418 """Calculate pairwise distance from two sequences""" 419 score = 0 420 max_score = 0 421 if self.scoring_matrix: 422 max_score1 = 0 423 max_score2 = 0 424 skip_letters = ['-', '*'] 425 for i in range(0, len(seq1)): 426 l1 = seq1[i] 427 l2 = seq2[i] 428 if l1 in skip_letters or l2 in skip_letters: 429 continue 430 if l1 not in self.scoring_matrix.names: 431 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 432 % (l1, seq1.id, i)) 433 if l2 not in self.scoring_matrix.names: 434 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 435 % (l2, seq2.id, i)) 436 max_score1 += self.scoring_matrix[l1, l1] 437 max_score2 += self.scoring_matrix[l2, l2] 438 score += self.scoring_matrix[l1, l2] 439 440 max_score = max_score1 > max_score2 and max_score1 or max_score2 441 else: 442 for i in range(0, len(seq1)): 443 l1 = seq1[i] 444 l2 = seq2[i] 445 if l1 == l2: 446 score += 1 447 max_score = len(seq1) 448 449 return 1 - (score * 1.0 / max_score)
450
451 - def get_distance(self, msa):
452 """Return a _DistanceMatrix for MSA object 453 454 :Parameters: 455 msa : MultipleSeqAlignment 456 DNA or Protein multiple sequence alignment. 457 458 """ 459 460 if not isinstance(msa, MultipleSeqAlignment): 461 raise TypeError("Must provide a MultipleSeqAlignment object.") 462 463 names = [s.id for s in msa] 464 dm = _DistanceMatrix(names) 465 for seq1, seq2 in itertools.combinations(msa, 2): 466 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 467 return dm
468
469 - def _build_protein_matrix(self, subsmat):
470 """Convert matrix from SubsMat format to _Matrix object""" 471 protein_matrix = _Matrix(self.protein_alphabet) 472 for k, v in subsmat.items(): 473 aa1, aa2 = k 474 protein_matrix[aa1, aa2] = v 475 return protein_matrix
476 477
478 -class TreeConstructor(object):
479 """Base class for all tree constructor.""" 480
481 - def build_tree(self, msa):
482 """Caller to built the tree from a MultipleSeqAlignment object. 483 484 This should be implemented in subclass""" 485 raise NotImplementedError("Method not implemented!")
486 487
488 -class DistanceTreeConstructor(TreeConstructor):
489 """Distance based tree constructor. 490 491 :Parameters: 492 method : str 493 Distance tree construction method, 'nj'(default) or 'upgma'. 494 distance_calculator : DistanceCalculator 495 The distance matrix calculator for multiple sequence alignment. 496 It must be provided if `build_tree` will be called. 497 498 Example 499 -------- 500 501 >>> from TreeConstruction import DistanceTreeConstructor 502 >>> constructor = DistanceTreeConstructor() 503 504 UPGMA Tree: 505 506 >>> upgmatree = constructor.upgma(dm) 507 >>> print upgmatree 508 Tree(rooted=True) 509 Clade(name='Inner4') 510 Clade(branch_length=0.171955155115, name='Inner1') 511 Clade(branch_length=0.111111111111, name='Epsilon') 512 Clade(branch_length=0.111111111111, name='Delta') 513 Clade(branch_length=0.0673103855608, name='Inner3') 514 Clade(branch_length=0.0907558806655, name='Inner2') 515 Clade(branch_length=0.125, name='Gamma') 516 Clade(branch_length=0.125, name='Beta') 517 Clade(branch_length=0.215755880666, name='Alpha') 518 519 NJ Tree: 520 521 >>> njtree = constructor.nj(dm) 522 >>> print njtree 523 Tree(rooted=False) 524 Clade(name='Inner3') 525 Clade(branch_length=0.0142054862889, name='Inner2') 526 Clade(branch_length=0.239265540676, name='Inner1') 527 Clade(branch_length=0.0853101915988, name='Epsilon') 528 Clade(branch_length=0.136912030623, name='Delta') 529 Clade(branch_length=0.292306275042, name='Alpha') 530 Clade(branch_length=0.0747705106139, name='Beta') 531 Clade(branch_length=0.175229489386, name='Gamma') 532 533 534 """ 535 536 methods = ['nj', 'upgma'] 537
538 - def __init__(self, distance_calculator=None, method="nj"):
539 if (distance_calculator is None 540 or isinstance(distance_calculator, DistanceCalculator)): 541 self.distance_calculator = distance_calculator 542 else: 543 raise TypeError("Must provide a DistanceCalculator object.") 544 if isinstance(method, str) and method in self.methods: 545 self.method = method 546 else: 547 raise TypeError("Bad method: " + method + 548 ". Available methods: " + ", ".join(self.methods))
549
550 - def build_tree(self, msa):
551 if self.distance_calculator: 552 dm = self.distance_calculator.get_distance(msa) 553 tree = None 554 if self.method == 'upgma': 555 tree = self.upgma(dm) 556 else: 557 tree = self.nj(dm) 558 return tree 559 else: 560 raise TypeError("Must provide a DistanceCalculator object.")
561
562 - def upgma(self, distance_matrix):
563 """Construct and return an UPGMA tree. 564 565 Constructs and returns an Unweighted Pair Group Method 566 with Arithmetic mean (UPGMA) tree. 567 568 :Parameters: 569 distance_matrix : _DistanceMatrix 570 The distance matrix for tree construction. 571 """ 572 if not isinstance(distance_matrix, _DistanceMatrix): 573 raise TypeError("Must provide a _DistanceMatrix object.") 574 575 # make a copy of the distance matrix to be used 576 dm = copy.deepcopy(distance_matrix) 577 # init terminal clades 578 clades = [BaseTree.Clade(None, name) for name in dm.names] 579 # init minimum index 580 min_i = 0 581 min_j = 0 582 inner_count = 0 583 while len(dm) > 1: 584 min_dist = dm[1, 0] 585 # find minimum index 586 for i in range(1, len(dm)): 587 for j in range(0, i): 588 if min_dist >= dm[i, j]: 589 min_dist = dm[i, j] 590 min_i = i 591 min_j = j 592 593 # create clade 594 clade1 = clades[min_i] 595 clade2 = clades[min_j] 596 inner_count += 1 597 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 598 inner_clade.clades.append(clade1) 599 inner_clade.clades.append(clade2) 600 # assign branch length 601 if clade1.is_terminal(): 602 clade1.branch_length = min_dist * 1.0 / 2 603 else: 604 clade1.branch_length = min_dist * \ 605 1.0 / 2 - self._height_of(clade1) 606 607 if clade2.is_terminal(): 608 clade2.branch_length = min_dist * 1.0 / 2 609 else: 610 clade2.branch_length = min_dist * \ 611 1.0 / 2 - self._height_of(clade2) 612 613 # update node list 614 clades[min_j] = inner_clade 615 del clades[min_i] 616 617 # rebuild distance matrix, 618 # set the distances of new node at the index of min_j 619 for k in range(0, len(dm)): 620 if k != min_i and k != min_j: 621 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 622 623 dm.names[min_j] = "Inner" + str(inner_count) 624 625 del dm[min_i] 626 inner_clade.branch_length = 0 627 return BaseTree.Tree(inner_clade)
628
629 - def nj(self, distance_matrix):
630 """Construct and return an Neighbor Joining tree. 631 632 :Parameters: 633 distance_matrix : _DistanceMatrix 634 The distance matrix for tree construction. 635 """ 636 637 if not isinstance(distance_matrix, _DistanceMatrix): 638 raise TypeError("Must provide a _DistanceMatrix object.") 639 640 # make a copy of the distance matrix to be used 641 dm = copy.deepcopy(distance_matrix) 642 # init terminal clades 643 clades = [BaseTree.Clade(None, name) for name in dm.names] 644 # init node distance 645 node_dist = [0] * len(dm) 646 # init minimum index 647 min_i = 0 648 min_j = 0 649 inner_count = 0 650 while len(dm) > 2: 651 # calculate nodeDist 652 for i in range(0, len(dm)): 653 node_dist[i] = 0 654 for j in range(0, len(dm)): 655 node_dist[i] += dm[i, j] 656 node_dist[i] = node_dist[i] / (len(dm) - 2) 657 658 # find minimum distance pair 659 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 660 min_i = 0 661 min_j = 1 662 for i in range(1, len(dm)): 663 for j in range(0, i): 664 temp = dm[i, j] - node_dist[i] - node_dist[j] 665 if min_dist > temp: 666 min_dist = temp 667 min_i = i 668 min_j = j 669 # create clade 670 clade1 = clades[min_i] 671 clade2 = clades[min_j] 672 inner_count += 1 673 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 674 inner_clade.clades.append(clade1) 675 inner_clade.clades.append(clade2) 676 # assign branch length 677 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] 678 - node_dist[min_j]) / 2.0 679 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 680 681 # update node list 682 clades[min_j] = inner_clade 683 del clades[min_i] 684 685 # rebuild distance matrix, 686 # set the distances of new node at the index of min_j 687 for k in range(0, len(dm)): 688 if k != min_i and k != min_j: 689 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] 690 - dm[min_i, min_j]) / 2.0 691 692 dm.names[min_j] = "Inner" + str(inner_count) 693 del dm[min_i] 694 695 # set the last clade as one of the child of the inner_clade 696 root = None 697 if clades[0] == inner_clade: 698 clades[0].branch_length = 0 699 clades[1].branch_length = dm[1, 0] 700 clades[0].clades.append(clades[1]) 701 root = clades[0] 702 else: 703 clades[0].branch_length = dm[1, 0] 704 clades[1].branch_length = 0 705 clades[1].clades.append(clades[0]) 706 root = clades[1] 707 708 return BaseTree.Tree(root, rooted=False)
709
710 - def _height_of(self, clade):
711 """calculate clade height -- the longest path to any terminal.""" 712 height = 0 713 if clade.is_terminal(): 714 height = clade.branch_length 715 else: 716 height = height + max([self._height_of(c) for c in clade.clades]) 717 return height
718 719 # #################### Tree Scoring and Searching Classes ##################### 720 721
722 -class Scorer(object):
723 """Base class for all tree scoring methods""" 724
725 - def get_score(self, tree, alignment):
726 """Caller to get the score of a tree for the given alignment. 727 728 This should be implemented in subclass""" 729 raise NotImplementedError("Method not implemented!")
730 731
732 -class TreeSearcher(object):
733 """Base class for all tree searching methods""" 734
735 - def search(self, starting_tree, alignment):
736 """Caller to search the best tree with a starting tree. 737 738 This should be implemented in subclass""" 739 raise NotImplementedError("Method not implemented!")
740 741
742 -class NNITreeSearcher(TreeSearcher):
743 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 744 745 :Parameters: 746 scorer : ParsimonyScorer 747 parsimony scorer to calculate the parsimony score of 748 different trees during NNI algorithm. 749 """ 750
751 - def __init__(self, scorer):
752 if isinstance(scorer, Scorer): 753 self.scorer = scorer 754 else: 755 raise TypeError("Must provide a Scorer object.")
756
757 - def search(self, starting_tree, alignment):
758 """Implement the TreeSearcher.search method. 759 760 :Parameters: 761 starting_tree : Tree 762 starting tree of NNI method. 763 alignment : MultipleSeqAlignment 764 multiple sequence alignment used to calculate parsimony 765 score of different NNI trees. 766 """ 767 768 return self._nni(starting_tree, alignment)
769
770 - def _nni(self, starting_tree, alignment):
771 """Search for the best parsimony tree using the NNI algorithm.""" 772 best_tree = starting_tree 773 while True: 774 best_score = self.scorer.get_score(best_tree, alignment) 775 temp = best_score 776 for t in self._get_neighbors(best_tree): 777 score = self.scorer.get_score(t, alignment) 778 if score < best_score: 779 best_score = score 780 best_tree = t 781 # stop if no smaller score exist 782 if best_score >= temp: 783 break 784 return best_tree
785
786 - def _get_neighbors(self, tree):
787 """Get all neighbor trees of the given tree. 788 789 Currently only for binary rooted trees. 790 """ 791 # make child to parent dict 792 parents = {} 793 for clade in tree.find_clades(): 794 if clade != tree.root: 795 node_path = tree.get_path(clade) 796 # cannot get the parent if the parent is root. Bug? 797 if len(node_path) == 1: 798 parents[clade] = tree.root 799 else: 800 parents[clade] = node_path[-2] 801 neighbors = [] 802 root_childs = [] 803 for clade in tree.get_nonterminals(order="level"): 804 if clade == tree.root: 805 left = clade.clades[0] 806 right = clade.clades[1] 807 root_childs.append(left) 808 root_childs.append(right) 809 if not left.is_terminal() and not right.is_terminal(): 810 # make changes around the left_left clade 811 # left_left = left.clades[0] 812 left_right = left.clades[1] 813 right_left = right.clades[0] 814 right_right = right.clades[1] 815 # neightbor 1 (left_left + right_right) 816 del left.clades[1] 817 del right.clades[1] 818 left.clades.append(right_right) 819 right.clades.append(left_right) 820 temp_tree = copy.deepcopy(tree) 821 neighbors.append(temp_tree) 822 # neighbor 2 (left_left + right_left) 823 del left.clades[1] 824 del right.clades[0] 825 left.clades.append(right_left) 826 right.clades.append(right_right) 827 temp_tree = copy.deepcopy(tree) 828 neighbors.append(temp_tree) 829 # change back (left_left + left_right) 830 del left.clades[1] 831 del right.clades[0] 832 left.clades.append(left_right) 833 right.clades.insert(0, right_left) 834 elif clade in root_childs: 835 # skip root child 836 continue 837 else: 838 # method for other clades 839 # make changes around the parent clade 840 left = clade.clades[0] 841 right = clade.clades[1] 842 parent = parents[clade] 843 if clade == parent.clades[0]: 844 sister = parent.clades[1] 845 # neighbor 1 (parent + right) 846 del parent.clades[1] 847 del clade.clades[1] 848 parent.clades.append(right) 849 clade.clades.append(sister) 850 temp_tree = copy.deepcopy(tree) 851 neighbors.append(temp_tree) 852 # neighbor 2 (parent + left) 853 del parent.clades[1] 854 del clade.clades[0] 855 parent.clades.append(left) 856 clade.clades.append(right) 857 temp_tree = copy.deepcopy(tree) 858 neighbors.append(temp_tree) 859 # change back (parent + sister) 860 del parent.clades[1] 861 del clade.clades[0] 862 parent.clades.append(sister) 863 clade.clades.insert(0, left) 864 else: 865 sister = parent.clades[0] 866 # neighbor 1 (parent + right) 867 del parent.clades[0] 868 del clade.clades[1] 869 parent.clades.insert(0, right) 870 clade.clades.append(sister) 871 temp_tree = copy.deepcopy(tree) 872 neighbors.append(temp_tree) 873 # neighbor 2 (parent + left) 874 del parent.clades[0] 875 del clade.clades[0] 876 parent.clades.insert(0, left) 877 clade.clades.append(right) 878 temp_tree = copy.deepcopy(tree) 879 neighbors.append(temp_tree) 880 # change back (parent + sister) 881 del parent.clades[0] 882 del clade.clades[0] 883 parent.clades.insert(0, sister) 884 clade.clades.insert(0, left) 885 return neighbors
886 887 # ######################## Parsimony Classes ########################## 888 889
890 -class ParsimonyScorer(Scorer):
891 """Parsimony scorer with a scoring matrix. 892 893 This is a combination of Fitch algorithm and Sankoff algorithm. 894 See ParsimonyTreeConstructor for usage. 895 896 :Parameters: 897 matrix : _Matrix 898 scoring matrix used in parsimony score calculation. 899 """ 900
901 - def __init__(self, matrix=None):
902 if not matrix or isinstance(matrix, _Matrix): 903 self.matrix = matrix 904 else: 905 raise TypeError("Must provide a _Matrix object.")
906
907 - def get_score(self, tree, alignment):
908 """Calculate and return the parsimony score given a tree and 909 the MSA using the Fitch algorithm without the penalty matrix 910 the Sankoff algorithm with the matrix""" 911 # make sure the tree is rooted and bifurcating 912 if not tree.is_bifurcating(): 913 raise ValueError("The tree provided should be bifurcating.") 914 if not tree.rooted: 915 tree.root_at_midpoint() 916 # sort tree terminals and alignment 917 terms = tree.get_terminals() 918 terms.sort(key=lambda term: term.name) 919 alignment.sort() 920 if not all([t.name == a.id for t, a in zip(terms, alignment)]): 921 raise ValueError( 922 "Taxon names of the input tree should be the same with the alignment.") 923 # term_align = dict(zip(terms, alignment)) 924 score = 0 925 for i in range(len(alignment[0])): 926 # parsimony score for column_i 927 score_i = 0 928 # get column 929 column_i = alignment[:, i] 930 # skip non-informative column 931 if column_i == len(column_i) * column_i[0]: 932 continue 933 934 # start calculating score_i using the tree and column_i 935 936 # Fitch algorithm without the penalty matrix 937 if not self.matrix: 938 # init by mapping terminal clades and states in column_i 939 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 940 for clade in tree.get_nonterminals(order="postorder"): 941 clade_childs = clade.clades 942 left_state = clade_states[clade_childs[0]] 943 right_state = clade_states[clade_childs[1]] 944 state = left_state & right_state 945 if not state: 946 state = left_state | right_state 947 score_i = score_i + 1 948 clade_states[clade] = state 949 # Sankoff algorithm with the penalty matrix 950 else: 951 inf = float('inf') 952 # init score arrays for terminal clades 953 alphabet = self.matrix.names 954 length = len(alphabet) 955 clade_scores = {} 956 for j in range(len(column_i)): 957 array = [inf] * length 958 index = alphabet.index(column_i[j]) 959 array[index] = 0 960 clade_scores[terms[j]] = array 961 # bottom up calculation 962 for clade in tree.get_nonterminals(order="postorder"): 963 clade_childs = clade.clades 964 left_score = clade_scores[clade_childs[0]] 965 right_score = clade_scores[clade_childs[1]] 966 array = [] 967 for m in range(length): 968 min_l = inf 969 min_r = inf 970 for n in range(length): 971 sl = self.matrix[ 972 alphabet[m], alphabet[n]] + left_score[n] 973 sr = self.matrix[ 974 alphabet[m], alphabet[n]] + right_score[n] 975 if min_l > sl: 976 min_l = sl 977 if min_r > sr: 978 min_r = sr 979 array.append(min_l + min_r) 980 clade_scores[clade] = array 981 # minimum from root score 982 score_i = min(array) 983 # TODO: resolve internal states 984 score = score + score_i 985 return score
986 987
988 -class ParsimonyTreeConstructor(TreeConstructor):
989 """Parsimony tree constructor. 990 991 :Parameters: 992 searcher : TreeSearcher 993 tree searcher to search the best parsimony tree. 994 starting_tree : Tree 995 starting tree provided to the searcher. 996 997 Example 998 -------- 999 1000 >>> from Bio import AlignIO 1001 >>> from TreeConstruction import * 1002 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1003 >>> print aln 1004 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1005 AACGTGGCCACAT Alpha 1006 AAGGTCGCCACAC Beta 1007 GAGATTTCCGCCT Delta 1008 GAGATCTCCGCCC Epsilon 1009 CAGTTCGCCACAA Gamma 1010 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1011 >>> print tree 1012 Tree(weight=1.0, rooted=False) 1013 Clade(branch_length=0.0, name='Inner3') 1014 Clade(branch_length=0.01421, name='Inner2') 1015 Clade(branch_length=0.23927, name='Inner1') 1016 Clade(branch_length=0.08531, name='Epsilon') 1017 Clade(branch_length=0.13691, name='Delta') 1018 Clade(branch_length=0.29231, name='Alpha') 1019 Clade(branch_length=0.07477, name='Beta') 1020 Clade(branch_length=0.17523, name='Gamma') 1021 >>> from TreeConstruction import * 1022 >>> scorer = ParsimonyScorer() 1023 >>> searcher = NNITreeSearcher(scorer) 1024 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1025 >>> pars_tree = constructor.build_tree(aln) 1026 >>> print pars_tree 1027 Tree(weight=1.0, rooted=True) 1028 Clade(branch_length=0.0) 1029 Clade(branch_length=0.197335, name='Inner1') 1030 Clade(branch_length=0.13691, name='Delta') 1031 Clade(branch_length=0.08531, name='Epsilon') 1032 Clade(branch_length=0.041935, name='Inner2') 1033 Clade(branch_length=0.01421, name='Inner3') 1034 Clade(branch_length=0.17523, name='Gamma') 1035 Clade(branch_length=0.07477, name='Beta') 1036 Clade(branch_length=0.29231, name='Alpha') 1037 """ 1038
1039 - def __init__(self, searcher, starting_tree=None):
1040 self.searcher = searcher 1041 self.starting_tree = starting_tree
1042
1043 - def build_tree(self, alignment):
1044 """Build the tree. 1045 1046 :Parameters: 1047 alignment : MultipleSeqAlignment 1048 multiple sequence alignment to calculate parsimony tree. 1049 """ 1050 # if starting_tree is none, 1051 # create a upgma tree with 'identity' scoring matrix 1052 if self.starting_tree is None: 1053 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1054 "upgma") 1055 self.starting_tree = dtc.build_tree(alignment) 1056 return self.searcher.search(self.starting_tree, alignment)
1057