Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7  __docformat__ = "restructuredtext en" 
   8   
   9  import itertools 
  10  import copy 
  11  from Bio.Phylo import BaseTree 
  12  from Bio.Align import MultipleSeqAlignment 
  13  from Bio.SubsMat import MatrixInfo 
  14  from Bio import _py3k 
  15   
  16   
17 -def _is_numeric(x):
18 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19 20
21 -class _Matrix(object):
22 """Base class for distance matrix or scoring matrix 23 24 Accepts a list of names and a lower triangular matrix.:: 25 26 matrix = [[0], 27 [1, 0], 28 [2, 3, 0], 29 [4, 5, 6, 0]] 30 represents the symmetric matrix of 31 [0,1,2,4] 32 [1,0,3,5] 33 [2,3,0,6] 34 [4,5,6,0] 35 36 :Parameters: 37 names : list 38 names of elements, used for indexing 39 matrix : list 40 nested list of numerical lists in lower triangular format 41 42 Example 43 ------- 44 45 >>> from Bio.Phylo.TreeConstruction import _Matrix 46 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 47 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 48 >>> m = _Matrix(names, matrix) 49 >>> m 50 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 51 52 You can use two indices to get or assign an element in the matrix. 53 54 >>> m[1,2] 55 3 56 >>> m['Beta','Gamma'] 57 3 58 >>> m['Beta','Gamma'] = 4 59 >>> m['Beta','Gamma'] 60 4 61 62 Further more, you can use one index to get or assign a list of elements related to that index. 63 64 >>> m[0] 65 [0, 1, 2, 4] 66 >>> m['Alpha'] 67 [0, 1, 2, 4] 68 >>> m['Alpha'] = [0, 7, 8, 9] 69 >>> m[0] 70 [0, 7, 8, 9] 71 >>> m[0,1] 72 7 73 74 Also you can delete or insert a column&row of elemets by index. 75 76 >>> m 77 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 78 >>> del m['Alpha'] 79 >>> m 80 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 81 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 82 >>> m 83 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 84 85 """ 86
87 - def __init__(self, names, matrix=None):
88 """Initialize matrix by a list of names and a list of 89 lower triangular matrix data""" 90 # check names 91 if isinstance(names, list) and all(isinstance(s, str) for s in names): 92 if len(set(names)) == len(names): 93 self.names = names 94 else: 95 raise ValueError("Duplicate names found") 96 else: 97 raise TypeError("'names' should be a list of strings") 98 99 # check matrix 100 if matrix is None: 101 # create a new one with 0 if matrix is not assigned 102 matrix = [[0] * i for i in range(1, len(self) + 1)] 103 self.matrix = matrix 104 else: 105 # check if all elements are numbers 106 if (isinstance(matrix, list) 107 and all(isinstance(l, list) for l in matrix) 108 and all(_is_numeric(n) for n in [item for sublist in matrix 109 for item in sublist])): 110 # check if the same length with names 111 if len(matrix) == len(names): 112 # check if is lower triangle format 113 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 114 self.matrix = matrix 115 else: 116 raise ValueError( 117 "'matrix' should be in lower triangle format") 118 else: 119 raise ValueError( 120 "'names' and 'matrix' should be the same size") 121 else: 122 raise TypeError("'matrix' should be a list of numerical lists")
123
124 - def __getitem__(self, item):
125 """Access value(s) by the index(s) or name(s). 126 127 For a _Matrix object 'dm':: 128 129 dm[i] get a value list from the given 'i' to others; 130 dm[i, j] get the value between 'i' and 'j'; 131 dm['name'] map name to index first 132 dm['name1', 'name2'] map name to index first 133 """ 134 # Handle single indexing 135 if isinstance(item, (int, str)): 136 index = None 137 if isinstance(item, int): 138 index = item 139 elif isinstance(item, str): 140 if item in self.names: 141 index = self.names.index(item) 142 else: 143 raise ValueError("Item not found.") 144 else: 145 raise TypeError("Invalid index type.") 146 # check index 147 if index > len(self) - 1: 148 raise IndexError("Index out of range.") 149 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 150 # Handle double indexing 151 elif len(item) == 2: 152 row_index = None 153 col_index = None 154 if all(isinstance(i, int) for i in item): 155 row_index, col_index = item 156 elif all(isinstance(i, str) for i in item): 157 row_name, col_name = item 158 if row_name in self.names and col_name in self.names: 159 row_index = self.names.index(row_name) 160 col_index = self.names.index(col_name) 161 else: 162 raise ValueError("Item not found.") 163 else: 164 raise TypeError("Invalid index type.") 165 # check index 166 if row_index > len(self) - 1 or col_index > len(self) - 1: 167 raise IndexError("Index out of range.") 168 if row_index > col_index: 169 return self.matrix[row_index][col_index] 170 else: 171 return self.matrix[col_index][row_index] 172 else: 173 raise TypeError("Invalid index type.")
174
175 - def __setitem__(self, item, value):
176 """Set value by the index(s) or name(s). 177 178 Similar to __getitem__:: 179 180 dm[1] = [1, 0, 3, 4] set values from '1' to others; 181 dm[i, j] = 2 set the value from 'i' to 'j' 182 183 """ 184 185 # Handle single indexing 186 if isinstance(item, (int, str)): 187 index = None 188 if isinstance(item, int): 189 index = item 190 elif isinstance(item, str): 191 if item in self.names: 192 index = self.names.index(item) 193 else: 194 raise ValueError("Item not found.") 195 else: 196 raise TypeError("Invalid index type.") 197 # check index 198 if index > len(self) - 1: 199 raise IndexError("Index out of range.") 200 # check and assign value 201 if isinstance(value, list) and all(_is_numeric(n) for n in value): 202 if len(value) == len(self): 203 for i in range(0, index): 204 self.matrix[index][i] = value[i] 205 for i in range(index, len(self)): 206 self.matrix[i][index] = value[i] 207 else: 208 raise ValueError("Value not the same size.") 209 else: 210 raise TypeError("Invalid value type.") 211 # Handle double indexing 212 elif len(item) == 2: 213 row_index = None 214 col_index = None 215 if all(isinstance(i, int) for i in item): 216 row_index, col_index = item 217 elif all(isinstance(i, str) for i in item): 218 row_name, col_name = item 219 if row_name in self.names and col_name in self.names: 220 row_index = self.names.index(row_name) 221 col_index = self.names.index(col_name) 222 else: 223 raise ValueError("Item not found.") 224 else: 225 raise TypeError("Invalid index type.") 226 # check index 227 if row_index > len(self) - 1 or col_index > len(self) - 1: 228 raise IndexError("Index out of range.") 229 # check and assign value 230 if _is_numeric(value): 231 if row_index > col_index: 232 self.matrix[row_index][col_index] = value 233 else: 234 self.matrix[col_index][row_index] = value 235 else: 236 raise TypeError("Invalid value type.") 237 else: 238 raise TypeError("Invalid index type.")
239
240 - def __delitem__(self, item):
241 """Delete related distances by the index or name""" 242 index = None 243 if isinstance(item, int): 244 index = item 245 elif isinstance(item, str): 246 index = self.names.index(item) 247 else: 248 raise TypeError("Invalid index type.") 249 # remove distances related to index 250 for i in range(index + 1, len(self)): 251 del self.matrix[i][index] 252 del self.matrix[index] 253 # remove name 254 del self.names[index]
255
256 - def insert(self, name, value, index=None):
257 """Insert distances given the name and value. 258 259 :Parameters: 260 name : str 261 name of a row/col to be inserted 262 value : list 263 a row/col of values to be inserted 264 """ 265 if isinstance(name, str): 266 # insert at the given index or at the end 267 if index is None: 268 index = len(self) 269 if not isinstance(index, int): 270 raise TypeError("Invalid index type.") 271 # insert name 272 self.names.insert(index, name) 273 # insert elements of 0, to be assigned 274 self.matrix.insert(index, [0] * index) 275 for i in range(index, len(self)): 276 self.matrix[i].insert(index, 0) 277 # assign value 278 self[index] = value 279 else: 280 raise TypeError("Invalid name type.")
281
282 - def __len__(self):
283 """Matrix length""" 284 return len(self.names)
285
286 - def __repr__(self):
287 return self.__class__.__name__ \ 288 + "(names=%s, matrix=%s)" \ 289 % tuple(map(repr, (self.names, self.matrix)))
290
291 - def __str__(self):
292 """Get a lower triangular matrix string""" 293 matrix_string = '\n'.join( 294 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 295 for i in range(0, len(self))]) 296 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 297 return matrix_string
298 299
300 -class _DistanceMatrix(_Matrix):
301 """Distance matrix class that can be used for distance based tree algorithms. 302 303 All diagonal elements will be zero no matter what the users provide. 304 """ 305
306 - def __init__(self, names, matrix=None):
307 _Matrix.__init__(self, names, matrix) 308 self._set_zero_diagonal()
309
310 - def __setitem__(self, item, value):
311 _Matrix.__setitem__(self, item, value) 312 self._set_zero_diagonal()
313
314 - def _set_zero_diagonal(self):
315 """set all diagonal elements to zero""" 316 for i in range(0, len(self)): 317 self.matrix[i][i] = 0
318 319
320 -class DistanceCalculator(object):
321 """Class to calculate the distance matrix from a DNA or Protein 322 323 Multiple Sequence Alignment(MSA) and the given name of the 324 substitution model. 325 326 Currently only scoring matrices are used. 327 328 :Parameters: 329 model : str 330 Name of the model matrix to be used to calculate distance. 331 The attribute `dna_matrices` contains the available model 332 names for DNA sequences and `protein_matrices` for protein 333 sequences. 334 335 Example 336 ------- 337 338 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 339 >>> from Bio import AlignIO 340 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 341 >>> print aln 342 SingleLetterAlphabet() alignment with 5 rows and 13 columns 343 AACGTGGCCACAT Alpha 344 AAGGTCGCCACAC Beta 345 GAGATTTCCGCCT Delta 346 GAGATCTCCGCCC Epsilon 347 CAGTTCGCCACAA Gamma 348 349 DNA calculator with 'identity' model:: 350 351 >>> calculator = DistanceCalculator('identity') 352 >>> dm = calculator.get_distance(aln) 353 >>> print dm 354 Alpha 0 355 Beta 0.230769230769 0 356 Gamma 0.384615384615 0.230769230769 0 357 Delta 0.538461538462 0.538461538462 0.538461538462 0 358 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 359 Alpha Beta Gamma Delta Epsilon 360 361 Protein calculator with 'blosum62' model:: 362 363 >>> calculator = DistanceCalculator('blosum62') 364 >>> dm = calculator.get_distance(aln) 365 >>> print dm 366 Alpha 0 367 Beta 0.369047619048 0 368 Gamma 0.493975903614 0.25 0 369 Delta 0.585365853659 0.547619047619 0.566265060241 0 370 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 371 Alpha Beta Gamma Delta Epsilon 372 """ 373 374 dna_alphabet = ['A', 'T', 'C', 'G'] 375 376 # BLAST nucleic acid scoring matrix 377 blastn = [[5], 378 [-4, 5], 379 [-4, -4, 5], 380 [-4, -4, -4, 5]] 381 382 # transition/transversion scoring matrix 383 trans = [[6], 384 [-5, 6], 385 [-5, -1, 6], 386 [-1, -5, -5, 6]] 387 388 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 389 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 390 'Z'] 391 392 # matrices available 393 dna_matrices = {'blastn': blastn, 'trans': trans} 394 protein_models = MatrixInfo.available_matrices 395 protein_matrices = dict((name, getattr(MatrixInfo, name)) 396 for name in protein_models) 397 398 dna_models = list(dna_matrices.keys()) 399 400 models = ['identity'] + dna_models + protein_models 401
402 - def __init__(self, model='identity'):
403 """Initialize with a distance model""" 404 405 if model == 'identity': 406 self.scoring_matrix = None 407 elif model in self.dna_models: 408 self.scoring_matrix = _Matrix(self.dna_alphabet, 409 self.dna_matrices[model]) 410 elif model in self.protein_models: 411 self.scoring_matrix = self._build_protein_matrix( 412 self.protein_matrices[model]) 413 else: 414 raise ValueError("Model not supported. Available models: " 415 + ", ".join(self.models))
416
417 - def _pairwise(self, seq1, seq2):
418 """Calculate pairwise distance from two sequences. 419 420 Returns a value between 0 (identical sequences) and 1 (completely 421 different, or seq1 is an empty string.) 422 """ 423 score = 0 424 max_score = 0 425 if self.scoring_matrix: 426 max_score1 = 0 427 max_score2 = 0 428 skip_letters = ['-', '*'] 429 for i in range(0, len(seq1)): 430 l1 = seq1[i] 431 l2 = seq2[i] 432 if l1 in skip_letters or l2 in skip_letters: 433 continue 434 if l1 not in self.scoring_matrix.names: 435 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 436 % (l1, seq1.id, i)) 437 if l2 not in self.scoring_matrix.names: 438 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 439 % (l2, seq2.id, i)) 440 max_score1 += self.scoring_matrix[l1, l1] 441 max_score2 += self.scoring_matrix[l2, l2] 442 score += self.scoring_matrix[l1, l2] 443 # Take the higher score if the matrix is asymmetrical 444 max_score = max(max_score1, max_score2) 445 else: 446 # Score by character identity, not skipping any special letters 447 for i in range(0, len(seq1)): 448 l1 = seq1[i] 449 l2 = seq2[i] 450 if l1 == l2: 451 score += 1 452 max_score = len(seq1) 453 454 if max_score == 0: 455 return 1 # max possible scaled distance 456 return 1 - (score * 1.0 / max_score)
457
458 - def get_distance(self, msa):
459 """Return a _DistanceMatrix for MSA object 460 461 :Parameters: 462 msa : MultipleSeqAlignment 463 DNA or Protein multiple sequence alignment. 464 465 """ 466 467 if not isinstance(msa, MultipleSeqAlignment): 468 raise TypeError("Must provide a MultipleSeqAlignment object.") 469 470 names = [s.id for s in msa] 471 dm = _DistanceMatrix(names) 472 for seq1, seq2 in itertools.combinations(msa, 2): 473 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 474 return dm
475
476 - def _build_protein_matrix(self, subsmat):
477 """Convert matrix from SubsMat format to _Matrix object""" 478 protein_matrix = _Matrix(self.protein_alphabet) 479 for k, v in subsmat.items(): 480 aa1, aa2 = k 481 protein_matrix[aa1, aa2] = v 482 return protein_matrix
483 484
485 -class TreeConstructor(object):
486 """Base class for all tree constructor.""" 487
488 - def build_tree(self, msa):
489 """Caller to built the tree from a MultipleSeqAlignment object. 490 491 This should be implemented in subclass""" 492 raise NotImplementedError("Method not implemented!")
493 494
495 -class DistanceTreeConstructor(TreeConstructor):
496 """Distance based tree constructor. 497 498 :Parameters: 499 method : str 500 Distance tree construction method, 'nj'(default) or 'upgma'. 501 distance_calculator : DistanceCalculator 502 The distance matrix calculator for multiple sequence alignment. 503 It must be provided if `build_tree` will be called. 504 505 Example 506 -------- 507 508 >>> from TreeConstruction import DistanceTreeConstructor 509 >>> constructor = DistanceTreeConstructor() 510 511 UPGMA Tree: 512 513 >>> upgmatree = constructor.upgma(dm) 514 >>> print upgmatree 515 Tree(rooted=True) 516 Clade(name='Inner4') 517 Clade(branch_length=0.171955155115, name='Inner1') 518 Clade(branch_length=0.111111111111, name='Epsilon') 519 Clade(branch_length=0.111111111111, name='Delta') 520 Clade(branch_length=0.0673103855608, name='Inner3') 521 Clade(branch_length=0.0907558806655, name='Inner2') 522 Clade(branch_length=0.125, name='Gamma') 523 Clade(branch_length=0.125, name='Beta') 524 Clade(branch_length=0.215755880666, name='Alpha') 525 526 NJ Tree: 527 528 >>> njtree = constructor.nj(dm) 529 >>> print njtree 530 Tree(rooted=False) 531 Clade(name='Inner3') 532 Clade(branch_length=0.0142054862889, name='Inner2') 533 Clade(branch_length=0.239265540676, name='Inner1') 534 Clade(branch_length=0.0853101915988, name='Epsilon') 535 Clade(branch_length=0.136912030623, name='Delta') 536 Clade(branch_length=0.292306275042, name='Alpha') 537 Clade(branch_length=0.0747705106139, name='Beta') 538 Clade(branch_length=0.175229489386, name='Gamma') 539 540 541 """ 542 543 methods = ['nj', 'upgma'] 544
545 - def __init__(self, distance_calculator=None, method="nj"):
546 if (distance_calculator is None 547 or isinstance(distance_calculator, DistanceCalculator)): 548 self.distance_calculator = distance_calculator 549 else: 550 raise TypeError("Must provide a DistanceCalculator object.") 551 if isinstance(method, str) and method in self.methods: 552 self.method = method 553 else: 554 raise TypeError("Bad method: " + method + 555 ". Available methods: " + ", ".join(self.methods))
556
557 - def build_tree(self, msa):
558 if self.distance_calculator: 559 dm = self.distance_calculator.get_distance(msa) 560 tree = None 561 if self.method == 'upgma': 562 tree = self.upgma(dm) 563 else: 564 tree = self.nj(dm) 565 return tree 566 else: 567 raise TypeError("Must provide a DistanceCalculator object.")
568
569 - def upgma(self, distance_matrix):
570 """Construct and return an UPGMA tree. 571 572 Constructs and returns an Unweighted Pair Group Method 573 with Arithmetic mean (UPGMA) tree. 574 575 :Parameters: 576 distance_matrix : _DistanceMatrix 577 The distance matrix for tree construction. 578 """ 579 if not isinstance(distance_matrix, _DistanceMatrix): 580 raise TypeError("Must provide a _DistanceMatrix object.") 581 582 # make a copy of the distance matrix to be used 583 dm = copy.deepcopy(distance_matrix) 584 # init terminal clades 585 clades = [BaseTree.Clade(None, name) for name in dm.names] 586 # init minimum index 587 min_i = 0 588 min_j = 0 589 inner_count = 0 590 while len(dm) > 1: 591 min_dist = dm[1, 0] 592 # find minimum index 593 for i in range(1, len(dm)): 594 for j in range(0, i): 595 if min_dist >= dm[i, j]: 596 min_dist = dm[i, j] 597 min_i = i 598 min_j = j 599 600 # create clade 601 clade1 = clades[min_i] 602 clade2 = clades[min_j] 603 inner_count += 1 604 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 605 inner_clade.clades.append(clade1) 606 inner_clade.clades.append(clade2) 607 # assign branch length 608 if clade1.is_terminal(): 609 clade1.branch_length = min_dist * 1.0 / 2 610 else: 611 clade1.branch_length = min_dist * \ 612 1.0 / 2 - self._height_of(clade1) 613 614 if clade2.is_terminal(): 615 clade2.branch_length = min_dist * 1.0 / 2 616 else: 617 clade2.branch_length = min_dist * \ 618 1.0 / 2 - self._height_of(clade2) 619 620 # update node list 621 clades[min_j] = inner_clade 622 del clades[min_i] 623 624 # rebuild distance matrix, 625 # set the distances of new node at the index of min_j 626 for k in range(0, len(dm)): 627 if k != min_i and k != min_j: 628 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 629 630 dm.names[min_j] = "Inner" + str(inner_count) 631 632 del dm[min_i] 633 inner_clade.branch_length = 0 634 return BaseTree.Tree(inner_clade)
635
636 - def nj(self, distance_matrix):
637 """Construct and return an Neighbor Joining tree. 638 639 :Parameters: 640 distance_matrix : _DistanceMatrix 641 The distance matrix for tree construction. 642 """ 643 644 if not isinstance(distance_matrix, _DistanceMatrix): 645 raise TypeError("Must provide a _DistanceMatrix object.") 646 647 # make a copy of the distance matrix to be used 648 dm = copy.deepcopy(distance_matrix) 649 # init terminal clades 650 clades = [BaseTree.Clade(None, name) for name in dm.names] 651 # init node distance 652 node_dist = [0] * len(dm) 653 # init minimum index 654 min_i = 0 655 min_j = 0 656 inner_count = 0 657 while len(dm) > 2: 658 # calculate nodeDist 659 for i in range(0, len(dm)): 660 node_dist[i] = 0 661 for j in range(0, len(dm)): 662 node_dist[i] += dm[i, j] 663 node_dist[i] = node_dist[i] / (len(dm) - 2) 664 665 # find minimum distance pair 666 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 667 min_i = 0 668 min_j = 1 669 for i in range(1, len(dm)): 670 for j in range(0, i): 671 temp = dm[i, j] - node_dist[i] - node_dist[j] 672 if min_dist > temp: 673 min_dist = temp 674 min_i = i 675 min_j = j 676 # create clade 677 clade1 = clades[min_i] 678 clade2 = clades[min_j] 679 inner_count += 1 680 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 681 inner_clade.clades.append(clade1) 682 inner_clade.clades.append(clade2) 683 # assign branch length 684 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] 685 - node_dist[min_j]) / 2.0 686 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 687 688 # update node list 689 clades[min_j] = inner_clade 690 del clades[min_i] 691 692 # rebuild distance matrix, 693 # set the distances of new node at the index of min_j 694 for k in range(0, len(dm)): 695 if k != min_i and k != min_j: 696 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] 697 - dm[min_i, min_j]) / 2.0 698 699 dm.names[min_j] = "Inner" + str(inner_count) 700 del dm[min_i] 701 702 # set the last clade as one of the child of the inner_clade 703 root = None 704 if clades[0] == inner_clade: 705 clades[0].branch_length = 0 706 clades[1].branch_length = dm[1, 0] 707 clades[0].clades.append(clades[1]) 708 root = clades[0] 709 else: 710 clades[0].branch_length = dm[1, 0] 711 clades[1].branch_length = 0 712 clades[1].clades.append(clades[0]) 713 root = clades[1] 714 715 return BaseTree.Tree(root, rooted=False)
716
717 - def _height_of(self, clade):
718 """calculate clade height -- the longest path to any terminal.""" 719 height = 0 720 if clade.is_terminal(): 721 height = clade.branch_length 722 else: 723 height = height + max([self._height_of(c) for c in clade.clades]) 724 return height
725 726 # #################### Tree Scoring and Searching Classes ##################### 727 728
729 -class Scorer(object):
730 """Base class for all tree scoring methods""" 731
732 - def get_score(self, tree, alignment):
733 """Caller to get the score of a tree for the given alignment. 734 735 This should be implemented in subclass""" 736 raise NotImplementedError("Method not implemented!")
737 738
739 -class TreeSearcher(object):
740 """Base class for all tree searching methods""" 741
742 - def search(self, starting_tree, alignment):
743 """Caller to search the best tree with a starting tree. 744 745 This should be implemented in subclass""" 746 raise NotImplementedError("Method not implemented!")
747 748
749 -class NNITreeSearcher(TreeSearcher):
750 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 751 752 :Parameters: 753 scorer : ParsimonyScorer 754 parsimony scorer to calculate the parsimony score of 755 different trees during NNI algorithm. 756 """ 757
758 - def __init__(self, scorer):
759 if isinstance(scorer, Scorer): 760 self.scorer = scorer 761 else: 762 raise TypeError("Must provide a Scorer object.")
763
764 - def search(self, starting_tree, alignment):
765 """Implement the TreeSearcher.search method. 766 767 :Parameters: 768 starting_tree : Tree 769 starting tree of NNI method. 770 alignment : MultipleSeqAlignment 771 multiple sequence alignment used to calculate parsimony 772 score of different NNI trees. 773 """ 774 775 return self._nni(starting_tree, alignment)
776
777 - def _nni(self, starting_tree, alignment):
778 """Search for the best parsimony tree using the NNI algorithm.""" 779 best_tree = starting_tree 780 while True: 781 best_score = self.scorer.get_score(best_tree, alignment) 782 temp = best_score 783 for t in self._get_neighbors(best_tree): 784 score = self.scorer.get_score(t, alignment) 785 if score < best_score: 786 best_score = score 787 best_tree = t 788 # stop if no smaller score exist 789 if best_score >= temp: 790 break 791 return best_tree
792
793 - def _get_neighbors(self, tree):
794 """Get all neighbor trees of the given tree. 795 796 Currently only for binary rooted trees. 797 """ 798 # make child to parent dict 799 parents = {} 800 for clade in tree.find_clades(): 801 if clade != tree.root: 802 node_path = tree.get_path(clade) 803 # cannot get the parent if the parent is root. Bug? 804 if len(node_path) == 1: 805 parents[clade] = tree.root 806 else: 807 parents[clade] = node_path[-2] 808 neighbors = [] 809 root_childs = [] 810 for clade in tree.get_nonterminals(order="level"): 811 if clade == tree.root: 812 left = clade.clades[0] 813 right = clade.clades[1] 814 root_childs.append(left) 815 root_childs.append(right) 816 if not left.is_terminal() and not right.is_terminal(): 817 # make changes around the left_left clade 818 # left_left = left.clades[0] 819 left_right = left.clades[1] 820 right_left = right.clades[0] 821 right_right = right.clades[1] 822 # neightbor 1 (left_left + right_right) 823 del left.clades[1] 824 del right.clades[1] 825 left.clades.append(right_right) 826 right.clades.append(left_right) 827 temp_tree = copy.deepcopy(tree) 828 neighbors.append(temp_tree) 829 # neighbor 2 (left_left + right_left) 830 del left.clades[1] 831 del right.clades[0] 832 left.clades.append(right_left) 833 right.clades.append(right_right) 834 temp_tree = copy.deepcopy(tree) 835 neighbors.append(temp_tree) 836 # change back (left_left + left_right) 837 del left.clades[1] 838 del right.clades[0] 839 left.clades.append(left_right) 840 right.clades.insert(0, right_left) 841 elif clade in root_childs: 842 # skip root child 843 continue 844 else: 845 # method for other clades 846 # make changes around the parent clade 847 left = clade.clades[0] 848 right = clade.clades[1] 849 parent = parents[clade] 850 if clade == parent.clades[0]: 851 sister = parent.clades[1] 852 # neighbor 1 (parent + right) 853 del parent.clades[1] 854 del clade.clades[1] 855 parent.clades.append(right) 856 clade.clades.append(sister) 857 temp_tree = copy.deepcopy(tree) 858 neighbors.append(temp_tree) 859 # neighbor 2 (parent + left) 860 del parent.clades[1] 861 del clade.clades[0] 862 parent.clades.append(left) 863 clade.clades.append(right) 864 temp_tree = copy.deepcopy(tree) 865 neighbors.append(temp_tree) 866 # change back (parent + sister) 867 del parent.clades[1] 868 del clade.clades[0] 869 parent.clades.append(sister) 870 clade.clades.insert(0, left) 871 else: 872 sister = parent.clades[0] 873 # neighbor 1 (parent + right) 874 del parent.clades[0] 875 del clade.clades[1] 876 parent.clades.insert(0, right) 877 clade.clades.append(sister) 878 temp_tree = copy.deepcopy(tree) 879 neighbors.append(temp_tree) 880 # neighbor 2 (parent + left) 881 del parent.clades[0] 882 del clade.clades[0] 883 parent.clades.insert(0, left) 884 clade.clades.append(right) 885 temp_tree = copy.deepcopy(tree) 886 neighbors.append(temp_tree) 887 # change back (parent + sister) 888 del parent.clades[0] 889 del clade.clades[0] 890 parent.clades.insert(0, sister) 891 clade.clades.insert(0, left) 892 return neighbors
893 894 # ######################## Parsimony Classes ########################## 895 896
897 -class ParsimonyScorer(Scorer):
898 """Parsimony scorer with a scoring matrix. 899 900 This is a combination of Fitch algorithm and Sankoff algorithm. 901 See ParsimonyTreeConstructor for usage. 902 903 :Parameters: 904 matrix : _Matrix 905 scoring matrix used in parsimony score calculation. 906 """ 907
908 - def __init__(self, matrix=None):
909 if not matrix or isinstance(matrix, _Matrix): 910 self.matrix = matrix 911 else: 912 raise TypeError("Must provide a _Matrix object.")
913
914 - def get_score(self, tree, alignment):
915 """Calculate and return the parsimony score given a tree and 916 the MSA using the Fitch algorithm without the penalty matrix 917 the Sankoff algorithm with the matrix""" 918 # make sure the tree is rooted and bifurcating 919 if not tree.is_bifurcating(): 920 raise ValueError("The tree provided should be bifurcating.") 921 if not tree.rooted: 922 tree.root_at_midpoint() 923 # sort tree terminals and alignment 924 terms = tree.get_terminals() 925 terms.sort(key=lambda term: term.name) 926 alignment.sort() 927 if not all([t.name == a.id for t, a in zip(terms, alignment)]): 928 raise ValueError( 929 "Taxon names of the input tree should be the same with the alignment.") 930 # term_align = dict(zip(terms, alignment)) 931 score = 0 932 for i in range(len(alignment[0])): 933 # parsimony score for column_i 934 score_i = 0 935 # get column 936 column_i = alignment[:, i] 937 # skip non-informative column 938 if column_i == len(column_i) * column_i[0]: 939 continue 940 941 # start calculating score_i using the tree and column_i 942 943 # Fitch algorithm without the penalty matrix 944 if not self.matrix: 945 # init by mapping terminal clades and states in column_i 946 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 947 for clade in tree.get_nonterminals(order="postorder"): 948 clade_childs = clade.clades 949 left_state = clade_states[clade_childs[0]] 950 right_state = clade_states[clade_childs[1]] 951 state = left_state & right_state 952 if not state: 953 state = left_state | right_state 954 score_i = score_i + 1 955 clade_states[clade] = state 956 # Sankoff algorithm with the penalty matrix 957 else: 958 inf = float('inf') 959 # init score arrays for terminal clades 960 alphabet = self.matrix.names 961 length = len(alphabet) 962 clade_scores = {} 963 for j in range(len(column_i)): 964 array = [inf] * length 965 index = alphabet.index(column_i[j]) 966 array[index] = 0 967 clade_scores[terms[j]] = array 968 # bottom up calculation 969 for clade in tree.get_nonterminals(order="postorder"): 970 clade_childs = clade.clades 971 left_score = clade_scores[clade_childs[0]] 972 right_score = clade_scores[clade_childs[1]] 973 array = [] 974 for m in range(length): 975 min_l = inf 976 min_r = inf 977 for n in range(length): 978 sl = self.matrix[ 979 alphabet[m], alphabet[n]] + left_score[n] 980 sr = self.matrix[ 981 alphabet[m], alphabet[n]] + right_score[n] 982 if min_l > sl: 983 min_l = sl 984 if min_r > sr: 985 min_r = sr 986 array.append(min_l + min_r) 987 clade_scores[clade] = array 988 # minimum from root score 989 score_i = min(array) 990 # TODO: resolve internal states 991 score = score + score_i 992 return score
993 994
995 -class ParsimonyTreeConstructor(TreeConstructor):
996 """Parsimony tree constructor. 997 998 :Parameters: 999 searcher : TreeSearcher 1000 tree searcher to search the best parsimony tree. 1001 starting_tree : Tree 1002 starting tree provided to the searcher. 1003 1004 Example 1005 -------- 1006 1007 >>> from Bio import AlignIO 1008 >>> from TreeConstruction import * 1009 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1010 >>> print aln 1011 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1012 AACGTGGCCACAT Alpha 1013 AAGGTCGCCACAC Beta 1014 GAGATTTCCGCCT Delta 1015 GAGATCTCCGCCC Epsilon 1016 CAGTTCGCCACAA Gamma 1017 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1018 >>> print tree 1019 Tree(weight=1.0, rooted=False) 1020 Clade(branch_length=0.0, name='Inner3') 1021 Clade(branch_length=0.01421, name='Inner2') 1022 Clade(branch_length=0.23927, name='Inner1') 1023 Clade(branch_length=0.08531, name='Epsilon') 1024 Clade(branch_length=0.13691, name='Delta') 1025 Clade(branch_length=0.29231, name='Alpha') 1026 Clade(branch_length=0.07477, name='Beta') 1027 Clade(branch_length=0.17523, name='Gamma') 1028 >>> from TreeConstruction import * 1029 >>> scorer = ParsimonyScorer() 1030 >>> searcher = NNITreeSearcher(scorer) 1031 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1032 >>> pars_tree = constructor.build_tree(aln) 1033 >>> print pars_tree 1034 Tree(weight=1.0, rooted=True) 1035 Clade(branch_length=0.0) 1036 Clade(branch_length=0.197335, name='Inner1') 1037 Clade(branch_length=0.13691, name='Delta') 1038 Clade(branch_length=0.08531, name='Epsilon') 1039 Clade(branch_length=0.041935, name='Inner2') 1040 Clade(branch_length=0.01421, name='Inner3') 1041 Clade(branch_length=0.17523, name='Gamma') 1042 Clade(branch_length=0.07477, name='Beta') 1043 Clade(branch_length=0.29231, name='Alpha') 1044 """ 1045
1046 - def __init__(self, searcher, starting_tree=None):
1047 self.searcher = searcher 1048 self.starting_tree = starting_tree
1049
1050 - def build_tree(self, alignment):
1051 """Build the tree. 1052 1053 :Parameters: 1054 alignment : MultipleSeqAlignment 1055 multiple sequence alignment to calculate parsimony tree. 1056 """ 1057 # if starting_tree is none, 1058 # create a upgma tree with 'identity' scoring matrix 1059 if self.starting_tree is None: 1060 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1061 "upgma") 1062 self.starting_tree = dtc.build_tree(alignment) 1063 return self.searcher.search(self.starting_tree, alignment)
1064