Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7  __docformat__ = "restructuredtext en" 
   8   
   9  import itertools 
  10  import copy 
  11  from Bio.Phylo import BaseTree 
  12  from Bio.Align import MultipleSeqAlignment 
  13  from Bio.SubsMat import MatrixInfo 
  14  from Bio import _py3k 
  15   
  16   
17 -def _is_numeric(x):
18 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19
20 -class _Matrix(object):
21 """A base class for distance matrix or scoring matrix that accepts 22 a list of names and a lower triangular matrix. 23 24 matrix = [[0], 25 [1, 0], 26 [2, 3, 0], 27 [4, 5, 6, 0]] 28 represents the symmetric matrix of 29 [0,1,2,4] 30 [1,0,3,5] 31 [2,3,0,6] 32 [4,5,6,0] 33 34 :Parameters: 35 names : list 36 names of elements, used for indexing 37 matrix : list 38 nested list of numerical lists in lower triangular format 39 40 Example 41 ------- 42 43 >>> from Bio.Phylo.TreeConstruction import _Matrix 44 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 45 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 46 >>> m = _Matrix(names, matrix) 47 >>> m 48 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 49 50 You can use two indices to get or assign an element in the matrix. 51 52 >>> m[1,2] 53 3 54 >>> m['Beta','Gamma'] 55 3 56 >>> m['Beta','Gamma'] = 4 57 >>> m['Beta','Gamma'] 58 4 59 60 Further more, you can use one index to get or assign a list of elements related to that index. 61 62 >>> m[0] 63 [0, 1, 2, 4] 64 >>> m['Alpha'] 65 [0, 1, 2, 4] 66 >>> m['Alpha'] = [0, 7, 8, 9] 67 >>> m[0] 68 [0, 7, 8, 9] 69 >>> m[0,1] 70 7 71 72 Also you can delete or insert a column&row of elemets by index. 73 74 >>> m 75 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 76 >>> del m['Alpha'] 77 >>> m 78 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 79 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 80 >>> m 81 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 82 83 """ 84
85 - def __init__(self, names, matrix=None):
86 """Initialize matrix by a list of names and a list of 87 lower triangular matrix data""" 88 # check names 89 if isinstance(names, list) and all(isinstance(s, str) for s in names): 90 if len(set(names)) == len(names): 91 self.names = names 92 else: 93 raise ValueError("Duplicate names found") 94 else: 95 raise TypeError("'names' should be a list of strings") 96 97 # check matrix 98 if matrix is None: 99 # create a new one with 0 if matrix is not assigned 100 matrix = [[0]*i for i in range(1, len(self) + 1)] 101 self.matrix = matrix 102 else: 103 # check if all elements are numbers 104 if (isinstance(matrix, list) 105 and all(isinstance(l, list) for l in matrix) 106 and all(_is_numeric(n) for n in [item for sublist in matrix 107 for item in sublist])): 108 # check if the same length with names 109 if len(matrix) == len(names): 110 # check if is lower triangle format 111 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 112 self.matrix = matrix 113 else: 114 raise ValueError("'matrix' should be in lower triangle format") 115 else: 116 raise ValueError("'names' and 'matrix' should be the same size") 117 else: 118 raise TypeError("'matrix' should be a list of numerical lists")
119
120 - def __getitem__(self, item):
121 """Access value(s) by the index(s) or name(s). 122 For a _Matrix object 'dm': 123 dm[i] get a value list from the given 'i' to others; 124 dm[i, j] get the value between 'i' and 'j'; 125 dm['name'] map name to index first 126 dm['name1', 'name2'] map name to index first 127 """ 128 # Handle single indexing 129 if isinstance(item, (int, str)): 130 index = None 131 if isinstance(item, int): 132 index = item 133 elif isinstance(item, str): 134 if item in self.names: 135 index = self.names.index(item) 136 else: 137 raise ValueError("Item not found.") 138 else: 139 raise TypeError("Invalid index type.") 140 # check index 141 if index > len(self) - 1: 142 raise IndexError("Index out of range.") 143 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 144 # Handle double indexing 145 elif len(item) == 2: 146 row_index = None 147 col_index = None 148 if all(isinstance(i, int) for i in item): 149 row_index, col_index = item 150 elif all(isinstance(i, str) for i in item): 151 row_name, col_name = item 152 if row_name in self.names and col_name in self.names: 153 row_index = self.names.index(row_name) 154 col_index = self.names.index(col_name) 155 else: 156 raise ValueError("Item not found.") 157 else: 158 raise TypeError("Invalid index type.") 159 # check index 160 if row_index > len(self) - 1 or col_index > len(self) - 1: 161 raise IndexError("Index out of range.") 162 if row_index > col_index: 163 return self.matrix[row_index][col_index] 164 else: 165 return self.matrix[col_index][row_index] 166 else: 167 raise TypeError("Invalid index type.")
168
169 - def __setitem__(self, item, value):
170 """Set value by the index(s) or name(s). 171 Similar to __getitem__ 172 dm[1] = [1, 0, 3, 4] set values from '1' to others; 173 dm[i, j] = 2 set the value from 'i' to 'j' 174 """ 175 176 # Handle single indexing 177 if isinstance(item, (int, str)): 178 index = None 179 if isinstance(item, int): 180 index = item 181 elif isinstance(item, str): 182 if item in self.names: 183 index = self.names.index(item) 184 else: 185 raise ValueError("Item not found.") 186 else: 187 raise TypeError("Invalid index type.") 188 # check index 189 if index > len(self) - 1: 190 raise IndexError("Index out of range.") 191 # check and assign value 192 if isinstance(value, list) and all(_is_numeric(n) for n in value): 193 if len(value) == len(self): 194 for i in range(0, index): 195 self.matrix[index][i] = value[i] 196 for i in range(index, len(self)): 197 self.matrix[i][index] = value[i] 198 else: 199 raise ValueError("Value not the same size.") 200 else: 201 raise TypeError("Invalid value type.") 202 # Handle double indexing 203 elif len(item) == 2: 204 row_index = None 205 col_index = None 206 if all(isinstance(i, int) for i in item): 207 row_index, col_index = item 208 elif all(isinstance(i, str) for i in item): 209 row_name, col_name = item 210 if row_name in self.names and col_name in self.names: 211 row_index = self.names.index(row_name) 212 col_index = self.names.index(col_name) 213 else: 214 raise ValueError("Item not found.") 215 else: 216 raise TypeError("Invalid index type.") 217 # check index 218 if row_index > len(self) - 1 or col_index > len(self) - 1: 219 raise IndexError("Index out of range.") 220 # check and assign value 221 if _is_numeric(value): 222 if row_index > col_index: 223 self.matrix[row_index][col_index] = value 224 else: 225 self.matrix[col_index][row_index] = value 226 else: 227 raise TypeError("Invalid value type.") 228 else: 229 raise TypeError("Invalid index type.")
230
231 - def __delitem__(self, item):
232 """Delete related distances by the index or name""" 233 index = None 234 if isinstance(item, int): 235 index = item 236 elif isinstance(item, str): 237 index = self.names.index(item) 238 else: 239 raise TypeError("Invalid index type.") 240 # remove distances related to index 241 for i in range(index + 1, len(self)): 242 del self.matrix[i][index] 243 del self.matrix[index] 244 # remove name 245 del self.names[index]
246
247 - def insert(self, name, value, index=None):
248 """Insert distances given the name and value. 249 250 :Parameters: 251 name : str 252 name of a row/col to be inserted 253 value : list 254 a row/col of values to be inserted 255 """ 256 if isinstance(name, str): 257 # insert at the given index or at the end 258 if index is None: 259 index = len(self) 260 if not isinstance(index, int): 261 raise TypeError("Invalid index type.") 262 # insert name 263 self.names.insert(index, name) 264 # insert elements of 0, to be assigned 265 self.matrix.insert(index, [0] * index) 266 for i in range(index, len(self)): 267 self.matrix[i].insert(index, 0) 268 # assign value 269 self[index] = value 270 else: 271 raise TypeError("Invalid name type.")
272
273 - def __len__(self):
274 """Matrix length""" 275 return len(self.names)
276
277 - def __repr__(self):
278 return self.__class__.__name__ \ 279 + "(names=%s, matrix=%s)" \ 280 % tuple(map(repr, (self.names, self.matrix)))
281
282 - def __str__(self):
283 """Get a lower triangular matrix string""" 284 matrix_string = '\n'.join( 285 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 286 for i in range(0, len(self))]) 287 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 288 return matrix_string
289 290
291 -class _DistanceMatrix(_Matrix):
292 """Distance matrix class that can be used for distance based tree 293 algorithms. 294 All diagonal elements will be zero no matter what the users provide. 295 """ 296
297 - def __init__(self, names, matrix=None):
298 _Matrix.__init__(self, names, matrix) 299 self._set_zero_diagonal()
300
301 - def __setitem__(self, item, value):
302 _Matrix.__setitem__(self, item, value) 303 self._set_zero_diagonal()
304
305 - def _set_zero_diagonal(self):
306 """set all diagonal elements to zero""" 307 for i in range(0, len(self)): 308 self.matrix[i][i] = 0
309 310
311 -class DistanceCalculator(object):
312 """Class to calculate the distance matrix from a DNA or Protein 313 Multiple Sequence Alignment(MSA) and the given name of the 314 substitution model. 315 316 Currently only scoring matrices are used. 317 318 :Parameters: 319 model : str 320 Name of the model matrix to be used to calculate distance. 321 The attribute `dna_matrices` contains the available model 322 names for DNA sequences and `protein_matrices` for protein 323 sequences. 324 325 Example 326 ------- 327 328 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 329 >>> from Bio import AlignIO 330 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 331 >>> print aln 332 SingleLetterAlphabet() alignment with 5 rows and 13 columns 333 AACGTGGCCACAT Alpha 334 AAGGTCGCCACAC Beta 335 GAGATTTCCGCCT Delta 336 GAGATCTCCGCCC Epsilon 337 CAGTTCGCCACAA Gamma 338 339 DNA calculator with 'identity' model: 340 341 >>> calculator = DistanceCalculator('identity') 342 >>> dm = calculator.get_distance(aln) 343 >>> print dm 344 Alpha 0 345 Beta 0.230769230769 0 346 Gamma 0.384615384615 0.230769230769 0 347 Delta 0.538461538462 0.538461538462 0.538461538462 0 348 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 349 Alpha Beta Gamma Delta Epsilon 350 351 Protein calculator with 'blosum62' model: 352 >>> calculator = DistanceCalculator('blosum62') 353 >>> dm = calculator.get_distance(aln) 354 >>> print dm 355 Alpha 0 356 Beta 0.369047619048 0 357 Gamma 0.493975903614 0.25 0 358 Delta 0.585365853659 0.547619047619 0.566265060241 0 359 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 360 Alpha Beta Gamma Delta Epsilon 361 """ 362 363 dna_alphabet = ['A', 'T', 'C', 'G'] 364 365 # BLAST nucleic acid scoring matrix 366 blastn = [[ 5], 367 [-4, 5], 368 [-4, -4, 5], 369 [-4, -4, -4, 5]] 370 371 # transition/transversion scoring matrix 372 trans = [[ 6], 373 [-5, 6], 374 [-5, -1, 6], 375 [-1, -5, -5, 6]] 376 377 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 378 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 379 'Z'] 380 381 # matrices available 382 dna_matrices = {'blastn': blastn, 'trans': trans} 383 protein_models = MatrixInfo.available_matrices 384 protein_matrices = dict((name, getattr(MatrixInfo, name)) 385 for name in protein_models) 386 387 dna_models = list(dna_matrices.keys()) 388 389 models = ['identity'] + dna_models + protein_models 390
391 - def __init__(self, model='identity'):
392 """Initialize with a distance model""" 393 394 if model == 'identity': 395 self.scoring_matrix = None 396 elif model in self.dna_models: 397 self.scoring_matrix = _Matrix(self.dna_alphabet, 398 self.dna_matrices[model]) 399 elif model in self.protein_models: 400 self.scoring_matrix = self._build_protein_matrix( 401 self.protein_matrices[model]) 402 else: 403 raise ValueError("Model not supported. Available models: " 404 + ", ".join(self.models))
405
406 - def _pairwise(self, seq1, seq2):
407 """Calculate pairwise distance from two sequences""" 408 score = 0 409 max_score = 0 410 if self.scoring_matrix: 411 max_score1 = 0 412 max_score2 = 0 413 skip_letters = ['-', '*'] 414 for i in range(0, len(seq1)): 415 l1 = seq1[i] 416 l2 = seq2[i] 417 if l1 in skip_letters or l2 in skip_letters: 418 continue 419 if l1 not in self.scoring_matrix.names: 420 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 421 % (l1, seq1.id, i)) 422 if l2 not in self.scoring_matrix.names: 423 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 424 % (l2, seq2.id, i)) 425 max_score1 += self.scoring_matrix[l1, l1] 426 max_score2 += self.scoring_matrix[l2, l2] 427 score += self.scoring_matrix[l1, l2] 428 429 max_score = max_score1 > max_score2 and max_score1 or max_score2 430 else: 431 for i in range(0, len(seq1)): 432 l1 = seq1[i] 433 l2 = seq2[i] 434 if l1 == l2: 435 score += 1 436 max_score = len(seq1) 437 438 return 1 - (score * 1.0 / max_score)
439
440 - def get_distance(self, msa):
441 """Return a _DistanceMatrix for MSA object 442 443 :Parameters: 444 msa : MultipleSeqAlignment 445 DNA or Protein multiple sequence alignment. 446 447 """ 448 449 if not isinstance(msa, MultipleSeqAlignment): 450 raise TypeError("Must provide a MultipleSeqAlignment object.") 451 452 names = [s.id for s in msa] 453 dm = _DistanceMatrix(names) 454 for seq1, seq2 in itertools.combinations(msa, 2): 455 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 456 return dm
457
458 - def _build_protein_matrix(self, subsmat):
459 """Convert matrix from SubsMat format to _Matrix object""" 460 protein_matrix = _Matrix(self.protein_alphabet) 461 for k, v in subsmat.items(): 462 aa1, aa2 = k 463 protein_matrix[aa1, aa2] = v 464 return protein_matrix
465 466
467 -class TreeConstructor(object):
468 """Base class for all tree constructor.""" 469
470 - def build_tree(self, msa):
471 """Caller to built the tree from a MultipleSeqAlignment object. 472 This should be implemented in subclass""" 473 raise NotImplementedError("Method not implemented!")
474 475
476 -class DistanceTreeConstructor(TreeConstructor):
477 """Distance based tree constructor. 478 479 :Parameters: 480 method : str 481 Distance tree construction method, 'nj'(default) or 'upgma'. 482 distance_calculator : DistanceCalculator 483 The distance matrix calculator for multiple sequence alignment. 484 It must be provided if `build_tree` will be called. 485 486 Example 487 -------- 488 489 >>> from TreeConstruction import DistanceTreeConstructor 490 >>> constructor = DistanceTreeConstructor() 491 492 UPGMA Tree: 493 494 >>> upgmatree = constructor.upgma(dm) 495 >>> print upgmatree 496 Tree(rooted=True) 497 Clade(name='Inner4') 498 Clade(branch_length=0.171955155115, name='Inner1') 499 Clade(branch_length=0.111111111111, name='Epsilon') 500 Clade(branch_length=0.111111111111, name='Delta') 501 Clade(branch_length=0.0673103855608, name='Inner3') 502 Clade(branch_length=0.0907558806655, name='Inner2') 503 Clade(branch_length=0.125, name='Gamma') 504 Clade(branch_length=0.125, name='Beta') 505 Clade(branch_length=0.215755880666, name='Alpha') 506 507 NJ Tree: 508 509 >>> njtree = constructor.nj(dm) 510 >>> print njtree 511 Tree(rooted=False) 512 Clade(name='Inner3') 513 Clade(branch_length=0.0142054862889, name='Inner2') 514 Clade(branch_length=0.239265540676, name='Inner1') 515 Clade(branch_length=0.0853101915988, name='Epsilon') 516 Clade(branch_length=0.136912030623, name='Delta') 517 Clade(branch_length=0.292306275042, name='Alpha') 518 Clade(branch_length=0.0747705106139, name='Beta') 519 Clade(branch_length=0.175229489386, name='Gamma') 520 521 522 """ 523 524 methods = ['nj', 'upgma'] 525
526 - def __init__(self, distance_calculator=None, method="nj"):
527 if (distance_calculator == None 528 or isinstance(distance_calculator, DistanceCalculator)): 529 self.distance_calculator = distance_calculator 530 else: 531 raise TypeError("Must provide a DistanceCalculator object.") 532 if isinstance(method, str) and method in self.methods: 533 self.method = method 534 else: 535 raise TypeError("Bad method: " + method + 536 ". Available methods: " + ", ".join(self.methods))
537
538 - def build_tree(self, msa):
539 if self.distance_calculator: 540 dm = self.distance_calculator.get_distance(msa) 541 tree = None 542 if self.method == 'upgma': 543 tree = self.upgma(dm) 544 else: 545 tree = self.nj(dm) 546 return tree 547 else: 548 raise TypeError("Must provide a DistanceCalculator object.")
549
550 - def upgma(self, distance_matrix):
551 """Construct and return an UPGMA(Unweighted Pair Group Method 552 with Arithmetic mean) tree. 553 554 :Parameters: 555 distance_matrix : _DistanceMatrix 556 The distance matrix for tree construction. 557 """ 558 if not isinstance(distance_matrix, _DistanceMatrix): 559 raise TypeError("Must provide a _DistanceMatrix object.") 560 561 # make a copy of the distance matrix to be used 562 dm = copy.deepcopy(distance_matrix) 563 # init terminal clades 564 clades = [BaseTree.Clade(None, name) for name in dm.names] 565 # init minimum index 566 min_i = 0 567 min_j = 0 568 inner_count = 0 569 while len(dm) > 1: 570 min_dist = dm[1, 0] 571 # find minimum index 572 for i in range(1, len(dm)): 573 for j in range(0, i): 574 if min_dist >= dm[i, j]: 575 min_dist = dm[i, j] 576 min_i = i 577 min_j = j 578 579 # create clade 580 clade1 = clades[min_i] 581 clade2 = clades[min_j] 582 inner_count += 1 583 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 584 inner_clade.clades.append(clade1) 585 inner_clade.clades.append(clade2) 586 #assign branch length 587 if clade1.is_terminal(): 588 clade1.branch_length = min_dist * 1.0 / 2 589 else: 590 clade1.branch_length = min_dist * 1.0 / 2 - self._height_of(clade1) 591 592 if clade2.is_terminal(): 593 clade2.branch_length = min_dist * 1.0 / 2 594 else: 595 clade2.branch_length = min_dist * 1.0 / 2 - self._height_of(clade2) 596 597 # update node list 598 clades[min_j] = inner_clade 599 del clades[min_i] 600 601 # rebuild distance matrix, 602 # set the distances of new node at the index of min_j 603 for k in range(0, len(dm)): 604 if k != min_i and k != min_j: 605 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 606 607 dm.names[min_j] = "Inner" + str(inner_count) 608 609 del dm[min_i] 610 inner_clade.branch_length = 0 611 return BaseTree.Tree(inner_clade)
612
613 - def nj(self, distance_matrix):
614 """Construct and return an Neighbor Joining tree. 615 616 :Parameters: 617 distance_matrix : _DistanceMatrix 618 The distance matrix for tree construction. 619 """ 620 621 if not isinstance(distance_matrix, _DistanceMatrix): 622 raise TypeError("Must provide a _DistanceMatrix object.") 623 624 # make a copy of the distance matrix to be used 625 dm = copy.deepcopy(distance_matrix) 626 # init terminal clades 627 clades = [BaseTree.Clade(None, name) for name in dm.names] 628 # init node distance 629 node_dist = [0] * len(dm) 630 # init minimum index 631 min_i = 0 632 min_j = 0 633 inner_count = 0 634 while len(dm) > 2: 635 # calculate nodeDist 636 for i in range(0, len(dm)): 637 node_dist[i] = 0 638 for j in range(0, len(dm)): 639 node_dist[i] += dm[i, j] 640 node_dist[i] = node_dist[i] / (len(dm) - 2) 641 642 # find minimum distance pair 643 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 644 min_i = 0 645 min_j = 1 646 for i in range(1, len(dm)): 647 for j in range(0, i): 648 temp = dm[i, j] - node_dist[i] - node_dist[j] 649 if min_dist > temp: 650 min_dist = temp 651 min_i = i 652 min_j = j 653 # create clade 654 clade1 = clades[min_i] 655 clade2 = clades[min_j] 656 inner_count += 1 657 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 658 inner_clade.clades.append(clade1) 659 inner_clade.clades.append(clade2) 660 #assign branch length 661 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] 662 - node_dist[min_j]) / 2.0 663 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 664 665 # update node list 666 clades[min_j] = inner_clade 667 del clades[min_i] 668 669 # rebuild distance matrix, 670 # set the distances of new node at the index of min_j 671 for k in range(0, len(dm)): 672 if k != min_i and k != min_j: 673 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] 674 - dm[min_i, min_j]) / 2.0 675 676 dm.names[min_j] = "Inner" + str(inner_count) 677 del dm[min_i] 678 679 # set the last clade as one of the child of the inner_clade 680 root = None 681 if clades[0] == inner_clade: 682 clades[0].branch_length = 0 683 clades[1].branch_length = dm[1, 0] 684 clades[0].clades.append(clades[1]) 685 root = clades[0] 686 else: 687 clades[0].branch_length = dm[1, 0] 688 clades[1].branch_length = 0 689 clades[1].clades.append(clades[0]) 690 root = clades[1] 691 692 return BaseTree.Tree(root, rooted=False)
693
694 - def _height_of(self, clade):
695 """calculate height of the clade -- the longest path to one of 696 the terminals""" 697 height = 0 698 if clade.is_terminal(): 699 height = clade.branch_length 700 else: 701 height = height + max([self._height_of(c) for c in clade.clades]) 702 return height
703 704 ##################### Tree Scoring and Searching Classes ##################### 705
706 -class Scorer(object):
707 """Base class for all tree scoring methods""" 708
709 - def get_score(self, tree, alignment):
710 """Caller to get the score of a tree for the given alignment. 711 This should be implemented in subclass""" 712 raise NotImplementedError("Method not implemented!")
713 714
715 -class TreeSearcher(object):
716 """Base class for all tree searching methods""" 717
718 - def search(self, starting_tree, alignment):
719 """Caller to search the best tree with a starting tree. 720 This should be implemented in subclass""" 721 raise NotImplementedError("Method not implemented!")
722 723
724 -class NNITreeSearcher(TreeSearcher):
725 """Tree searching class of NNI(Nearest Neighbor Interchanges) 726 algorithm. 727 728 :Parameters: 729 scorer: ParsimonyScorer 730 parsimony scorer to calculate the parsimony score of 731 different trees during NNI algorithm. 732 """ 733
734 - def __init__(self, scorer):
735 if isinstance(scorer, Scorer): 736 self.scorer = scorer 737 else: 738 raise TypeError("Must provide a Scorer object.")
739
740 - def search(self, starting_tree, alignment):
741 """Implement the TreeSearcher.search method. 742 743 :Parameters: 744 starting_tree : Tree 745 starting tree of NNI method. 746 alignment: MultipleSeqAlignment 747 multiple sequence alignment used to calculate parsimony 748 score of different NNI trees. 749 """ 750 751 return self._nni(starting_tree, alignment)
752
753 - def _nni(self, starting_tree, alignment):
754 """Search the best parsimony tree by using the NNI(Nearest 755 Neighbor Interchanges) algorithm""" 756 best_tree = starting_tree 757 while True: 758 best_score = self.scorer.get_score(best_tree, alignment) 759 temp = best_score 760 for t in self._get_neighbors(best_tree): 761 score = self.scorer.get_score(t, alignment) 762 if score < best_score: 763 best_score = score 764 best_tree = t 765 # stop if no smaller score exist 766 if best_score >= temp: 767 break 768 return best_tree
769
770 - def _get_neighbors(self, tree):
771 """Get all neighbor trees of the given tree(currently only 772 for binary rooted tree)""" 773 # make child to parent dict 774 parents = {} 775 for clade in tree.find_clades(): 776 if clade != tree.root: 777 node_path = tree.get_path(clade) 778 # cannot get the parent if the parent is root. Bug? 779 if len(node_path) == 1: 780 parents[clade] = tree.root 781 else: 782 parents[clade] = node_path[-2] 783 neighbors = [] 784 root_childs = [] 785 for clade in tree.get_nonterminals(order="level"): 786 if clade == tree.root: 787 left = clade.clades[0] 788 right = clade.clades[1] 789 root_childs.append(left) 790 root_childs.append(right) 791 if not left.is_terminal() and not right.is_terminal(): 792 # make changes around the left_left clade 793 #left_left = left.clades[0] 794 left_right = left.clades[1] 795 right_left = right.clades[0] 796 right_right = right.clades[1] 797 # neightbor 1 (left_left + right_right) 798 del left.clades[1] 799 del right.clades[1] 800 left.clades.append(right_right) 801 right.clades.append(left_right) 802 temp_tree = copy.deepcopy(tree) 803 neighbors.append(temp_tree) 804 # neighbor 2 (left_left + right_left) 805 del left.clades[1] 806 del right.clades[0] 807 left.clades.append(right_left) 808 right.clades.append(right_right) 809 temp_tree = copy.deepcopy(tree) 810 neighbors.append(temp_tree) 811 # change back (left_left + left_right) 812 del left.clades[1] 813 del right.clades[0] 814 left.clades.append(left_right) 815 right.clades.insert(0, right_left) 816 elif clade in root_childs: 817 # skip root child 818 continue 819 else: 820 # method for other clades 821 # make changes around the parent clade 822 left = clade.clades[0] 823 right = clade.clades[1] 824 parent = parents[clade] 825 if clade == parent.clades[0]: 826 sister = parent.clades[1] 827 # neighbor 1 (parent + right) 828 del parent.clades[1] 829 del clade.clades[1] 830 parent.clades.append(right) 831 clade.clades.append(sister) 832 temp_tree = copy.deepcopy(tree) 833 neighbors.append(temp_tree) 834 # neighbor 2 (parent + left) 835 del parent.clades[1] 836 del clade.clades[0] 837 parent.clades.append(left) 838 clade.clades.append(right) 839 temp_tree = copy.deepcopy(tree) 840 neighbors.append(temp_tree) 841 # change back (parent + sister) 842 del parent.clades[1] 843 del clade.clades[0] 844 parent.clades.append(sister) 845 clade.clades.insert(0, left) 846 else: 847 sister = parent.clades[0] 848 # neighbor 1 (parent + right) 849 del parent.clades[0] 850 del clade.clades[1] 851 parent.clades.insert(0, right) 852 clade.clades.append(sister) 853 temp_tree = copy.deepcopy(tree) 854 neighbors.append(temp_tree) 855 # neighbor 2 (parent + left) 856 del parent.clades[0] 857 del clade.clades[0] 858 parent.clades.insert(0, left) 859 clade.clades.append(right) 860 temp_tree = copy.deepcopy(tree) 861 neighbors.append(temp_tree) 862 # change back (parent + sister) 863 del parent.clades[0] 864 del clade.clades[0] 865 parent.clades.insert(0, sister) 866 clade.clades.insert(0, left) 867 return neighbors
868 869 ######################### Parsimony Classes ########################## 870 871
872 -class ParsimonyScorer(Scorer):
873 """Parsimony scorer with a scoring matrix. 874 875 This is a combination of Fitch algorithm and Sankoff algorithm. 876 See ParsimonyTreeConstructor for usage. 877 878 :Parameters: 879 matrix: _Matrix 880 scoring matrix used in parsimony score calculation. 881 """
882 - def __init__(self, matrix=None):
883 if not matrix or isinstance(matrix, _Matrix): 884 self.matrix = matrix 885 else: 886 raise TypeError("Must provide a _Matrix object.")
887
888 - def get_score(self, tree, alignment):
889 """Calculate and return the parsimony score given a tree and 890 the MSA using the Fitch algorithm without the penalty matrix 891 the Sankoff algorithm with the matrix""" 892 # make sure the tree is rooted and bifurcating 893 if not tree.is_bifurcating(): 894 raise ValueError("The tree provided should be bifurcating.") 895 if not tree.rooted: 896 tree.root_at_midpoint() 897 # sort tree terminals and alignment 898 terms = tree.get_terminals() 899 terms.sort(key=lambda term: term.name) 900 alignment.sort() 901 if not all([t.name == a.id for t, a in zip(terms, alignment)]): 902 raise ValueError("Taxon names of the input tree should be the same with the alignment.") 903 #term_align = dict(zip(terms, alignment)) 904 score = 0 905 for i in range(len(alignment[0])): 906 # parsimony score for column_i 907 score_i = 0 908 # get column 909 column_i = alignment[:, i] 910 # skip non-informative column 911 if column_i == len(column_i) * column_i[0]: 912 continue 913 914 #### start calculating score_i using the tree and column_i 915 916 ## Fitch algorithm without the penalty matrix 917 if not self.matrix: 918 # init by mapping terminal clades and states in column_i 919 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 920 for clade in tree.get_nonterminals(order="postorder"): 921 clade_childs = clade.clades 922 left_state = clade_states[clade_childs[0]] 923 right_state = clade_states[clade_childs[1]] 924 state = left_state & right_state 925 if not state: 926 state = left_state | right_state 927 score_i = score_i + 1 928 clade_states[clade] = state 929 ## Sankoff algorithm with the penalty matrix 930 else: 931 inf = float('inf') 932 # init score arrays for terminal clades 933 alphabet = self.matrix.names 934 length = len(alphabet) 935 clade_scores = {} 936 for j in range(len(column_i)): 937 array = [inf] * length 938 index = alphabet.index(column_i[j]) 939 array[index] = 0 940 clade_scores[terms[j]] = array 941 # bottom up calculation 942 for clade in tree.get_nonterminals(order="postorder"): 943 clade_childs = clade.clades 944 left_score = clade_scores[clade_childs[0]] 945 right_score = clade_scores[clade_childs[1]] 946 array = [] 947 for m in range(length): 948 min_l = inf 949 min_r = inf 950 for n in range(length): 951 sl = self.matrix[alphabet[m], alphabet[n]] + left_score[n] 952 sr = self.matrix[alphabet[m], alphabet[n]] + right_score[n] 953 if min_l > sl: 954 min_l = sl 955 if min_r > sr: 956 min_r = sr 957 array.append(min_l + min_r) 958 clade_scores[clade] = array 959 # minimum from root score 960 score_i = min(array) 961 # TODO: resolve internal states 962 score = score + score_i 963 return score
964 965
966 -class ParsimonyTreeConstructor(TreeConstructor):
967 """Parsimony tree constructor. 968 969 :Parameters: 970 searcher: TreeSearcher 971 tree searcher to search the best parsimony tree. 972 starting_tree: Tree 973 starting tree provided to the searcher. 974 975 Example 976 -------- 977 978 >>> from Bio import AlignIO 979 >>> from TreeConstruction import * 980 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 981 >>> print aln 982 SingleLetterAlphabet() alignment with 5 rows and 13 columns 983 AACGTGGCCACAT Alpha 984 AAGGTCGCCACAC Beta 985 GAGATTTCCGCCT Delta 986 GAGATCTCCGCCC Epsilon 987 CAGTTCGCCACAA Gamma 988 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 989 >>> print tree 990 Tree(weight=1.0, rooted=False) 991 Clade(branch_length=0.0, name='Inner3') 992 Clade(branch_length=0.01421, name='Inner2') 993 Clade(branch_length=0.23927, name='Inner1') 994 Clade(branch_length=0.08531, name='Epsilon') 995 Clade(branch_length=0.13691, name='Delta') 996 Clade(branch_length=0.29231, name='Alpha') 997 Clade(branch_length=0.07477, name='Beta') 998 Clade(branch_length=0.17523, name='Gamma') 999 >>> from TreeConstruction import * 1000 >>> scorer = ParsimonyScorer() 1001 >>> searcher = NNITreeSearcher(scorer) 1002 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1003 >>> pars_tree = constructor.build_tree(aln) 1004 >>> print pars_tree 1005 Tree(weight=1.0, rooted=True) 1006 Clade(branch_length=0.0) 1007 Clade(branch_length=0.197335, name='Inner1') 1008 Clade(branch_length=0.13691, name='Delta') 1009 Clade(branch_length=0.08531, name='Epsilon') 1010 Clade(branch_length=0.041935, name='Inner2') 1011 Clade(branch_length=0.01421, name='Inner3') 1012 Clade(branch_length=0.17523, name='Gamma') 1013 Clade(branch_length=0.07477, name='Beta') 1014 Clade(branch_length=0.29231, name='Alpha') 1015 """
1016 - def __init__(self, searcher, starting_tree=None):
1017 self.searcher = searcher 1018 self.starting_tree = starting_tree
1019
1020 - def build_tree(self, alignment):
1021 """ 1022 :Parameters: 1023 alignment: MultipleSeqAlignment 1024 multiple sequence alignment to calculate parsimony tree. 1025 """ 1026 # if starting_tree is none, 1027 # create a upgma tree with 'identity' scoring matrix 1028 if self.starting_tree is None: 1029 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1030 "upgma") 1031 self.starting_tree = dtc.build_tree(alignment) 1032 return self.searcher.search(self.starting_tree, alignment)
1033