Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction""" 
   7   
   8  import itertools 
   9  import copy 
  10  from Bio.Phylo import BaseTree 
  11  from Bio.Align import MultipleSeqAlignment 
  12  from Bio.SubsMat import MatrixInfo 
  13  from Bio import _py3k 
  14  from Bio._py3k import zip, range 
  15   
  16   
17 -def _is_numeric(x):
18 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19 20
21 -class _Matrix(object):
22 """Base class for distance matrix or scoring matrix 23 24 Accepts a list of names and a lower triangular matrix.:: 25 26 matrix = [[0], 27 [1, 0], 28 [2, 3, 0], 29 [4, 5, 6, 0]] 30 represents the symmetric matrix of 31 [0,1,2,4] 32 [1,0,3,5] 33 [2,3,0,6] 34 [4,5,6,0] 35 36 :Parameters: 37 names : list 38 names of elements, used for indexing 39 matrix : list 40 nested list of numerical lists in lower triangular format 41 42 Example 43 ------- 44 45 >>> from Bio.Phylo.TreeConstruction import _Matrix 46 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 47 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 48 >>> m = _Matrix(names, matrix) 49 >>> m 50 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 51 52 You can use two indices to get or assign an element in the matrix. 53 54 >>> m[1,2] 55 3 56 >>> m['Beta','Gamma'] 57 3 58 >>> m['Beta','Gamma'] = 4 59 >>> m['Beta','Gamma'] 60 4 61 62 Further more, you can use one index to get or assign a list of elements related to that index. 63 64 >>> m[0] 65 [0, 1, 2, 4] 66 >>> m['Alpha'] 67 [0, 1, 2, 4] 68 >>> m['Alpha'] = [0, 7, 8, 9] 69 >>> m[0] 70 [0, 7, 8, 9] 71 >>> m[0,1] 72 7 73 74 Also you can delete or insert a column&row of elemets by index. 75 76 >>> m 77 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 78 >>> del m['Alpha'] 79 >>> m 80 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 81 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 82 >>> m 83 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 84 85 """ 86
87 - def __init__(self, names, matrix=None):
88 """Initialize matrix by a list of names and a list of 89 lower triangular matrix data. 90 """ 91 # check names 92 if isinstance(names, list) and all(isinstance(s, str) for s in names): 93 if len(set(names)) == len(names): 94 self.names = names 95 else: 96 raise ValueError("Duplicate names found") 97 else: 98 raise TypeError("'names' should be a list of strings") 99 100 # check matrix 101 if matrix is None: 102 # create a new one with 0 if matrix is not assigned 103 matrix = [[0] * i for i in range(1, len(self) + 1)] 104 self.matrix = matrix 105 else: 106 # check if all elements are numbers 107 if (isinstance(matrix, list) and 108 all(isinstance(l, list) for l in matrix) and 109 all(_is_numeric(n) for n in [item for sublist in matrix 110 for item in sublist])): 111 # check if the same length with names 112 if len(matrix) == len(names): 113 # check if is lower triangle format 114 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 115 self.matrix = matrix 116 else: 117 raise ValueError( 118 "'matrix' should be in lower triangle format") 119 else: 120 raise ValueError( 121 "'names' and 'matrix' should be the same size") 122 else: 123 raise TypeError("'matrix' should be a list of numerical lists")
124
125 - def __getitem__(self, item):
126 """Access value(s) by the index(s) or name(s). 127 128 For a _Matrix object 'dm':: 129 130 dm[i] get a value list from the given 'i' to others; 131 dm[i, j] get the value between 'i' and 'j'; 132 dm['name'] map name to index first 133 dm['name1', 'name2'] map name to index first 134 135 """ 136 # Handle single indexing 137 if isinstance(item, (int, str)): 138 index = None 139 if isinstance(item, int): 140 index = item 141 elif isinstance(item, str): 142 if item in self.names: 143 index = self.names.index(item) 144 else: 145 raise ValueError("Item not found.") 146 else: 147 raise TypeError("Invalid index type.") 148 # check index 149 if index > len(self) - 1: 150 raise IndexError("Index out of range.") 151 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 152 # Handle double indexing 153 elif len(item) == 2: 154 row_index = None 155 col_index = None 156 if all(isinstance(i, int) for i in item): 157 row_index, col_index = item 158 elif all(isinstance(i, str) for i in item): 159 row_name, col_name = item 160 if row_name in self.names and col_name in self.names: 161 row_index = self.names.index(row_name) 162 col_index = self.names.index(col_name) 163 else: 164 raise ValueError("Item not found.") 165 else: 166 raise TypeError("Invalid index type.") 167 # check index 168 if row_index > len(self) - 1 or col_index > len(self) - 1: 169 raise IndexError("Index out of range.") 170 if row_index > col_index: 171 return self.matrix[row_index][col_index] 172 else: 173 return self.matrix[col_index][row_index] 174 else: 175 raise TypeError("Invalid index type.")
176
177 - def __setitem__(self, item, value):
178 """Set value by the index(s) or name(s). 179 180 Similar to __getitem__:: 181 182 dm[1] = [1, 0, 3, 4] set values from '1' to others; 183 dm[i, j] = 2 set the value from 'i' to 'j' 184 185 """ 186 # Handle single indexing 187 if isinstance(item, (int, str)): 188 index = None 189 if isinstance(item, int): 190 index = item 191 elif isinstance(item, str): 192 if item in self.names: 193 index = self.names.index(item) 194 else: 195 raise ValueError("Item not found.") 196 else: 197 raise TypeError("Invalid index type.") 198 # check index 199 if index > len(self) - 1: 200 raise IndexError("Index out of range.") 201 # check and assign value 202 if isinstance(value, list) and all(_is_numeric(n) for n in value): 203 if len(value) == len(self): 204 for i in range(0, index): 205 self.matrix[index][i] = value[i] 206 for i in range(index, len(self)): 207 self.matrix[i][index] = value[i] 208 else: 209 raise ValueError("Value not the same size.") 210 else: 211 raise TypeError("Invalid value type.") 212 # Handle double indexing 213 elif len(item) == 2: 214 row_index = None 215 col_index = None 216 if all(isinstance(i, int) for i in item): 217 row_index, col_index = item 218 elif all(isinstance(i, str) for i in item): 219 row_name, col_name = item 220 if row_name in self.names and col_name in self.names: 221 row_index = self.names.index(row_name) 222 col_index = self.names.index(col_name) 223 else: 224 raise ValueError("Item not found.") 225 else: 226 raise TypeError("Invalid index type.") 227 # check index 228 if row_index > len(self) - 1 or col_index > len(self) - 1: 229 raise IndexError("Index out of range.") 230 # check and assign value 231 if _is_numeric(value): 232 if row_index > col_index: 233 self.matrix[row_index][col_index] = value 234 else: 235 self.matrix[col_index][row_index] = value 236 else: 237 raise TypeError("Invalid value type.") 238 else: 239 raise TypeError("Invalid index type.")
240
241 - def __delitem__(self, item):
242 """Delete related distances by the index or name""" 243 index = None 244 if isinstance(item, int): 245 index = item 246 elif isinstance(item, str): 247 index = self.names.index(item) 248 else: 249 raise TypeError("Invalid index type.") 250 # remove distances related to index 251 for i in range(index + 1, len(self)): 252 del self.matrix[i][index] 253 del self.matrix[index] 254 # remove name 255 del self.names[index]
256
257 - def insert(self, name, value, index=None):
258 """Insert distances given the name and value. 259 260 :Parameters: 261 name : str 262 name of a row/col to be inserted 263 value : list 264 a row/col of values to be inserted 265 266 """ 267 if isinstance(name, str): 268 # insert at the given index or at the end 269 if index is None: 270 index = len(self) 271 if not isinstance(index, int): 272 raise TypeError("Invalid index type.") 273 # insert name 274 self.names.insert(index, name) 275 # insert elements of 0, to be assigned 276 self.matrix.insert(index, [0] * index) 277 for i in range(index, len(self)): 278 self.matrix[i].insert(index, 0) 279 # assign value 280 self[index] = value 281 else: 282 raise TypeError("Invalid name type.")
283
284 - def __len__(self):
285 """Matrix length""" 286 return len(self.names)
287
288 - def __repr__(self):
289 return self.__class__.__name__ \ 290 + "(names=%s, matrix=%s)" \ 291 % tuple(map(repr, (self.names, self.matrix)))
292
293 - def __str__(self):
294 """Get a lower triangular matrix string""" 295 matrix_string = '\n'.join( 296 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 297 for i in range(0, len(self))]) 298 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 299 return matrix_string
300 301
302 -class DistanceMatrix(_Matrix):
303 """Distance matrix class that can be used for distance based tree algorithms. 304 305 All diagonal elements will be zero no matter what the users provide. 306 """ 307
308 - def __init__(self, names, matrix=None):
309 _Matrix.__init__(self, names, matrix) 310 self._set_zero_diagonal()
311
312 - def __setitem__(self, item, value):
313 _Matrix.__setitem__(self, item, value) 314 self._set_zero_diagonal()
315
316 - def _set_zero_diagonal(self):
317 """Set all diagonal elements to zero""" 318 for i in range(0, len(self)): 319 self.matrix[i][i] = 0
320
321 - def format_phylip(self, handle):
322 """Write data in Phylip format to a given file-like object or handle. 323 324 The output stream is the input distance matrix format used with Phylip 325 programs (e.g. 'neighbor'). See: 326 http://evolution.genetics.washington.edu/phylip/doc/neighbor.html 327 328 :Parameters: 329 handle : file or file-like object 330 A writeable file handle or other object supporting the 'write' 331 method, such as StringIO or sys.stdout. On Python 3, should be 332 open in text mode. 333 334 """ 335 handle.write(" {0}\n".format(len(self.names))) 336 # Phylip needs space-separated, vertically aligned columns 337 name_width = max(12, max(map(len, self.names)) + 1) 338 value_fmts = ("{" + str(x) + ":.4f}" 339 for x in range(1, len(self.matrix) + 1)) 340 row_fmt = "{0:" + str(name_width) + "s}" + " ".join(value_fmts) + "\n" 341 for i, (name, values) in enumerate(zip(self.names, self.matrix)): 342 # Mirror the matrix values across the diagonal 343 mirror_values = (self.matrix[j][i] 344 for j in range(i + 1, len(self.matrix))) 345 fields = itertools.chain([name], values, mirror_values) 346 handle.write(row_fmt.format(*fields))
347 348 349 # Shim for compatibility with Biopython<1.70 (#1304) 350 _DistanceMatrix = DistanceMatrix 351 352
353 -class DistanceCalculator(object):
354 """Class to calculate the distance matrix from a DNA or Protein 355 356 Multiple Sequence Alignment(MSA) and the given name of the 357 substitution model. 358 359 Currently only scoring matrices are used. 360 361 :Parameters: 362 model : str 363 Name of the model matrix to be used to calculate distance. 364 The attribute `dna_matrices` contains the available model 365 names for DNA sequences and `protein_matrices` for protein 366 sequences. 367 368 Example 369 ------- 370 371 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 372 >>> from Bio import AlignIO 373 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 374 >>> print aln 375 SingleLetterAlphabet() alignment with 5 rows and 13 columns 376 AACGTGGCCACAT Alpha 377 AAGGTCGCCACAC Beta 378 GAGATTTCCGCCT Delta 379 GAGATCTCCGCCC Epsilon 380 CAGTTCGCCACAA Gamma 381 382 DNA calculator with 'identity' model:: 383 384 >>> calculator = DistanceCalculator('identity') 385 >>> dm = calculator.get_distance(aln) 386 >>> print dm 387 Alpha 0 388 Beta 0.230769230769 0 389 Gamma 0.384615384615 0.230769230769 0 390 Delta 0.538461538462 0.538461538462 0.538461538462 0 391 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 392 Alpha Beta Gamma Delta Epsilon 393 394 Protein calculator with 'blosum62' model:: 395 396 >>> calculator = DistanceCalculator('blosum62') 397 >>> dm = calculator.get_distance(aln) 398 >>> print dm 399 Alpha 0 400 Beta 0.369047619048 0 401 Gamma 0.493975903614 0.25 0 402 Delta 0.585365853659 0.547619047619 0.566265060241 0 403 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 404 Alpha Beta Gamma Delta Epsilon 405 406 """ 407 408 dna_alphabet = ['A', 'T', 'C', 'G'] 409 410 # BLAST nucleic acid scoring matrix 411 blastn = [[5], 412 [-4, 5], 413 [-4, -4, 5], 414 [-4, -4, -4, 5]] 415 416 # transition/transversion scoring matrix 417 trans = [[6], 418 [-5, 6], 419 [-5, -1, 6], 420 [-1, -5, -5, 6]] 421 422 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 423 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 424 'Z'] 425 426 # matrices available 427 dna_matrices = {'blastn': blastn, 'trans': trans} 428 protein_models = MatrixInfo.available_matrices 429 protein_matrices = dict((name, getattr(MatrixInfo, name)) 430 for name in protein_models) 431 432 dna_models = list(dna_matrices.keys()) 433 434 models = ['identity'] + dna_models + protein_models 435
436 - def __init__(self, model='identity', skip_letters=None):
437 """Initialize with a distance model.""" 438 # Shim for backward compatibility (#491) 439 if skip_letters: 440 self.skip_letters = skip_letters 441 elif model == 'identity': 442 self.skip_letters = () 443 else: 444 self.skip_letters = ('-', '*') 445 446 if model == 'identity': 447 self.scoring_matrix = None 448 elif model in self.dna_models: 449 self.scoring_matrix = _Matrix(self.dna_alphabet, 450 self.dna_matrices[model]) 451 elif model in self.protein_models: 452 self.scoring_matrix = self._build_protein_matrix( 453 self.protein_matrices[model]) 454 else: 455 raise ValueError("Model not supported. Available models: " + 456 ", ".join(self.models))
457
458 - def _pairwise(self, seq1, seq2):
459 """Calculate pairwise distance from two sequences. 460 461 Returns a value between 0 (identical sequences) and 1 (completely 462 different, or seq1 is an empty string.) 463 """ 464 score = 0 465 max_score = 0 466 if self.scoring_matrix: 467 max_score1 = 0 468 max_score2 = 0 469 for i in range(0, len(seq1)): 470 l1 = seq1[i] 471 l2 = seq2[i] 472 if l1 in self.skip_letters or l2 in self.skip_letters: 473 continue 474 if l1 not in self.scoring_matrix.names: 475 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 476 % (l1, seq1.id, i)) 477 if l2 not in self.scoring_matrix.names: 478 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 479 % (l2, seq2.id, i)) 480 max_score1 += self.scoring_matrix[l1, l1] 481 max_score2 += self.scoring_matrix[l2, l2] 482 score += self.scoring_matrix[l1, l2] 483 # Take the higher score if the matrix is asymmetrical 484 max_score = max(max_score1, max_score2) 485 else: 486 # Score by character identity, not skipping any special letters 487 score = sum(l1 == l2 488 for l1, l2 in zip(seq1, seq2) 489 if l1 not in self.skip_letters and l2 not in self.skip_letters) 490 max_score = len(seq1) 491 if max_score == 0: 492 return 1 # max possible scaled distance 493 return 1 - (score * 1.0 / max_score)
494
495 - def get_distance(self, msa):
496 """Return a DistanceMatrix for MSA object 497 498 :Parameters: 499 msa : MultipleSeqAlignment 500 DNA or Protein multiple sequence alignment. 501 502 """ 503 if not isinstance(msa, MultipleSeqAlignment): 504 raise TypeError("Must provide a MultipleSeqAlignment object.") 505 506 names = [s.id for s in msa] 507 dm = DistanceMatrix(names) 508 for seq1, seq2 in itertools.combinations(msa, 2): 509 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 510 return dm
511
512 - def _build_protein_matrix(self, subsmat):
513 """Convert matrix from SubsMat format to _Matrix object""" 514 protein_matrix = _Matrix(self.protein_alphabet) 515 for k, v in subsmat.items(): 516 aa1, aa2 = k 517 protein_matrix[aa1, aa2] = v 518 return protein_matrix
519 520
521 -class TreeConstructor(object):
522 """Base class for all tree constructor.""" 523
524 - def build_tree(self, msa):
525 """Caller to built the tree from a MultipleSeqAlignment object. 526 527 This should be implemented in subclass. 528 """ 529 raise NotImplementedError("Method not implemented!")
530 531
532 -class DistanceTreeConstructor(TreeConstructor):
533 """Distance based tree constructor. 534 535 :Parameters: 536 method : str 537 Distance tree construction method, 'nj'(default) or 'upgma'. 538 distance_calculator : DistanceCalculator 539 The distance matrix calculator for multiple sequence alignment. 540 It must be provided if `build_tree` will be called. 541 542 Example 543 -------- 544 545 >>> from TreeConstruction import DistanceTreeConstructor 546 >>> constructor = DistanceTreeConstructor() 547 548 UPGMA Tree: 549 550 >>> upgmatree = constructor.upgma(dm) 551 >>> print upgmatree 552 Tree(rooted=True) 553 Clade(name='Inner4') 554 Clade(branch_length=0.171955155115, name='Inner1') 555 Clade(branch_length=0.111111111111, name='Epsilon') 556 Clade(branch_length=0.111111111111, name='Delta') 557 Clade(branch_length=0.0673103855608, name='Inner3') 558 Clade(branch_length=0.0907558806655, name='Inner2') 559 Clade(branch_length=0.125, name='Gamma') 560 Clade(branch_length=0.125, name='Beta') 561 Clade(branch_length=0.215755880666, name='Alpha') 562 563 NJ Tree: 564 565 >>> njtree = constructor.nj(dm) 566 >>> print njtree 567 Tree(rooted=False) 568 Clade(name='Inner3') 569 Clade(branch_length=0.0142054862889, name='Inner2') 570 Clade(branch_length=0.239265540676, name='Inner1') 571 Clade(branch_length=0.0853101915988, name='Epsilon') 572 Clade(branch_length=0.136912030623, name='Delta') 573 Clade(branch_length=0.292306275042, name='Alpha') 574 Clade(branch_length=0.0747705106139, name='Beta') 575 Clade(branch_length=0.175229489386, name='Gamma') 576 577 578 """ 579 580 methods = ['nj', 'upgma'] 581
582 - def __init__(self, distance_calculator=None, method="nj"):
583 if (distance_calculator is None or isinstance(distance_calculator, DistanceCalculator)): 584 self.distance_calculator = distance_calculator 585 else: 586 raise TypeError("Must provide a DistanceCalculator object.") 587 if isinstance(method, str) and method in self.methods: 588 self.method = method 589 else: 590 raise TypeError("Bad method: " + method + 591 ". Available methods: " + ", ".join(self.methods))
592
593 - def build_tree(self, msa):
594 if self.distance_calculator: 595 dm = self.distance_calculator.get_distance(msa) 596 tree = None 597 if self.method == 'upgma': 598 tree = self.upgma(dm) 599 else: 600 tree = self.nj(dm) 601 return tree 602 else: 603 raise TypeError("Must provide a DistanceCalculator object.")
604
605 - def upgma(self, distance_matrix):
606 """Construct and return an UPGMA tree. 607 608 Constructs and returns an Unweighted Pair Group Method 609 with Arithmetic mean (UPGMA) tree. 610 611 :Parameters: 612 distance_matrix : DistanceMatrix 613 The distance matrix for tree construction. 614 615 """ 616 if not isinstance(distance_matrix, DistanceMatrix): 617 raise TypeError("Must provide a DistanceMatrix object.") 618 619 # make a copy of the distance matrix to be used 620 dm = copy.deepcopy(distance_matrix) 621 # init terminal clades 622 clades = [BaseTree.Clade(None, name) for name in dm.names] 623 # init minimum index 624 min_i = 0 625 min_j = 0 626 inner_count = 0 627 while len(dm) > 1: 628 min_dist = dm[1, 0] 629 # find minimum index 630 for i in range(1, len(dm)): 631 for j in range(0, i): 632 if min_dist >= dm[i, j]: 633 min_dist = dm[i, j] 634 min_i = i 635 min_j = j 636 637 # create clade 638 clade1 = clades[min_i] 639 clade2 = clades[min_j] 640 inner_count += 1 641 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 642 inner_clade.clades.append(clade1) 643 inner_clade.clades.append(clade2) 644 # assign branch length 645 if clade1.is_terminal(): 646 clade1.branch_length = min_dist * 1.0 / 2 647 else: 648 clade1.branch_length = min_dist * \ 649 1.0 / 2 - self._height_of(clade1) 650 651 if clade2.is_terminal(): 652 clade2.branch_length = min_dist * 1.0 / 2 653 else: 654 clade2.branch_length = min_dist * \ 655 1.0 / 2 - self._height_of(clade2) 656 657 # update node list 658 clades[min_j] = inner_clade 659 del clades[min_i] 660 661 # rebuild distance matrix, 662 # set the distances of new node at the index of min_j 663 for k in range(0, len(dm)): 664 if k != min_i and k != min_j: 665 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 666 667 dm.names[min_j] = "Inner" + str(inner_count) 668 669 del dm[min_i] 670 inner_clade.branch_length = 0 671 return BaseTree.Tree(inner_clade)
672
673 - def nj(self, distance_matrix):
674 """Construct and return an Neighbor Joining tree. 675 676 :Parameters: 677 distance_matrix : DistanceMatrix 678 The distance matrix for tree construction. 679 680 """ 681 if not isinstance(distance_matrix, DistanceMatrix): 682 raise TypeError("Must provide a DistanceMatrix object.") 683 684 # make a copy of the distance matrix to be used 685 dm = copy.deepcopy(distance_matrix) 686 # init terminal clades 687 clades = [BaseTree.Clade(None, name) for name in dm.names] 688 # init node distance 689 node_dist = [0] * len(dm) 690 # init minimum index 691 min_i = 0 692 min_j = 0 693 inner_count = 0 694 while len(dm) > 2: 695 # calculate nodeDist 696 for i in range(0, len(dm)): 697 node_dist[i] = 0 698 for j in range(0, len(dm)): 699 node_dist[i] += dm[i, j] 700 node_dist[i] = node_dist[i] / (len(dm) - 2) 701 702 # find minimum distance pair 703 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 704 min_i = 0 705 min_j = 1 706 for i in range(1, len(dm)): 707 for j in range(0, i): 708 temp = dm[i, j] - node_dist[i] - node_dist[j] 709 if min_dist > temp: 710 min_dist = temp 711 min_i = i 712 min_j = j 713 # create clade 714 clade1 = clades[min_i] 715 clade2 = clades[min_j] 716 inner_count += 1 717 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 718 inner_clade.clades.append(clade1) 719 inner_clade.clades.append(clade2) 720 # assign branch length 721 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - 722 node_dist[min_j]) / 2.0 723 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 724 725 # update node list 726 clades[min_j] = inner_clade 727 del clades[min_i] 728 729 # rebuild distance matrix, 730 # set the distances of new node at the index of min_j 731 for k in range(0, len(dm)): 732 if k != min_i and k != min_j: 733 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - 734 dm[min_i, min_j]) / 2.0 735 736 dm.names[min_j] = "Inner" + str(inner_count) 737 del dm[min_i] 738 739 # set the last clade as one of the child of the inner_clade 740 root = None 741 if clades[0] == inner_clade: 742 clades[0].branch_length = 0 743 clades[1].branch_length = dm[1, 0] 744 clades[0].clades.append(clades[1]) 745 root = clades[0] 746 else: 747 clades[0].branch_length = dm[1, 0] 748 clades[1].branch_length = 0 749 clades[1].clades.append(clades[0]) 750 root = clades[1] 751 752 return BaseTree.Tree(root, rooted=False)
753
754 - def _height_of(self, clade):
755 """Calculate clade height -- the longest path to any terminal.""" 756 height = 0 757 if clade.is_terminal(): 758 height = clade.branch_length 759 else: 760 height = height + max([self._height_of(c) for c in clade.clades]) 761 return height
762 763 # #################### Tree Scoring and Searching Classes ##################### 764 765
766 -class Scorer(object):
767 """Base class for all tree scoring methods""" 768
769 - def get_score(self, tree, alignment):
770 """Caller to get the score of a tree for the given alignment. 771 772 This should be implemented in subclass. 773 """ 774 raise NotImplementedError("Method not implemented!")
775 776
777 -class TreeSearcher(object):
778 """Base class for all tree searching methods""" 779
780 - def search(self, starting_tree, alignment):
781 """Caller to search the best tree with a starting tree. 782 783 This should be implemented in subclass. 784 """ 785 raise NotImplementedError("Method not implemented!")
786 787
788 -class NNITreeSearcher(TreeSearcher):
789 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 790 791 :Parameters: 792 scorer : ParsimonyScorer 793 parsimony scorer to calculate the parsimony score of 794 different trees during NNI algorithm. 795 796 """ 797
798 - def __init__(self, scorer):
799 if isinstance(scorer, Scorer): 800 self.scorer = scorer 801 else: 802 raise TypeError("Must provide a Scorer object.")
803
804 - def search(self, starting_tree, alignment):
805 """Implement the TreeSearcher.search method. 806 807 :Parameters: 808 starting_tree : Tree 809 starting tree of NNI method. 810 alignment : MultipleSeqAlignment 811 multiple sequence alignment used to calculate parsimony 812 score of different NNI trees. 813 814 """ 815 return self._nni(starting_tree, alignment)
816
817 - def _nni(self, starting_tree, alignment):
818 """Search for the best parsimony tree using the NNI algorithm.""" 819 best_tree = starting_tree 820 while True: 821 best_score = self.scorer.get_score(best_tree, alignment) 822 temp = best_score 823 for t in self._get_neighbors(best_tree): 824 score = self.scorer.get_score(t, alignment) 825 if score < best_score: 826 best_score = score 827 best_tree = t 828 # stop if no smaller score exist 829 if best_score >= temp: 830 break 831 return best_tree
832
833 - def _get_neighbors(self, tree):
834 """Get all neighbor trees of the given tree. 835 836 Currently only for binary rooted trees. 837 """ 838 # make child to parent dict 839 parents = {} 840 for clade in tree.find_clades(): 841 if clade != tree.root: 842 node_path = tree.get_path(clade) 843 # cannot get the parent if the parent is root. Bug? 844 if len(node_path) == 1: 845 parents[clade] = tree.root 846 else: 847 parents[clade] = node_path[-2] 848 neighbors = [] 849 root_childs = [] 850 for clade in tree.get_nonterminals(order="level"): 851 if clade == tree.root: 852 left = clade.clades[0] 853 right = clade.clades[1] 854 root_childs.append(left) 855 root_childs.append(right) 856 if not left.is_terminal() and not right.is_terminal(): 857 # make changes around the left_left clade 858 # left_left = left.clades[0] 859 left_right = left.clades[1] 860 right_left = right.clades[0] 861 right_right = right.clades[1] 862 # neightbor 1 (left_left + right_right) 863 del left.clades[1] 864 del right.clades[1] 865 left.clades.append(right_right) 866 right.clades.append(left_right) 867 temp_tree = copy.deepcopy(tree) 868 neighbors.append(temp_tree) 869 # neighbor 2 (left_left + right_left) 870 del left.clades[1] 871 del right.clades[0] 872 left.clades.append(right_left) 873 right.clades.append(right_right) 874 temp_tree = copy.deepcopy(tree) 875 neighbors.append(temp_tree) 876 # change back (left_left + left_right) 877 del left.clades[1] 878 del right.clades[0] 879 left.clades.append(left_right) 880 right.clades.insert(0, right_left) 881 elif clade in root_childs: 882 # skip root child 883 continue 884 else: 885 # method for other clades 886 # make changes around the parent clade 887 left = clade.clades[0] 888 right = clade.clades[1] 889 parent = parents[clade] 890 if clade == parent.clades[0]: 891 sister = parent.clades[1] 892 # neighbor 1 (parent + right) 893 del parent.clades[1] 894 del clade.clades[1] 895 parent.clades.append(right) 896 clade.clades.append(sister) 897 temp_tree = copy.deepcopy(tree) 898 neighbors.append(temp_tree) 899 # neighbor 2 (parent + left) 900 del parent.clades[1] 901 del clade.clades[0] 902 parent.clades.append(left) 903 clade.clades.append(right) 904 temp_tree = copy.deepcopy(tree) 905 neighbors.append(temp_tree) 906 # change back (parent + sister) 907 del parent.clades[1] 908 del clade.clades[0] 909 parent.clades.append(sister) 910 clade.clades.insert(0, left) 911 else: 912 sister = parent.clades[0] 913 # neighbor 1 (parent + right) 914 del parent.clades[0] 915 del clade.clades[1] 916 parent.clades.insert(0, right) 917 clade.clades.append(sister) 918 temp_tree = copy.deepcopy(tree) 919 neighbors.append(temp_tree) 920 # neighbor 2 (parent + left) 921 del parent.clades[0] 922 del clade.clades[0] 923 parent.clades.insert(0, left) 924 clade.clades.append(right) 925 temp_tree = copy.deepcopy(tree) 926 neighbors.append(temp_tree) 927 # change back (parent + sister) 928 del parent.clades[0] 929 del clade.clades[0] 930 parent.clades.insert(0, sister) 931 clade.clades.insert(0, left) 932 return neighbors
933 934 # ######################## Parsimony Classes ########################## 935 936
937 -class ParsimonyScorer(Scorer):
938 """Parsimony scorer with a scoring matrix. 939 940 This is a combination of Fitch algorithm and Sankoff algorithm. 941 See ParsimonyTreeConstructor for usage. 942 943 :Parameters: 944 matrix : _Matrix 945 scoring matrix used in parsimony score calculation. 946 947 """ 948
949 - def __init__(self, matrix=None):
950 if not matrix or isinstance(matrix, _Matrix): 951 self.matrix = matrix 952 else: 953 raise TypeError("Must provide a _Matrix object.")
954
955 - def get_score(self, tree, alignment):
956 """Calculate parsimony score using the Fitch algorithm 957 958 Calculate and return the parsimony score given a tree and the 959 MSA using either the Fitch algorithm (without a penalty matrix) 960 or the Sankoff algorithm (with a matrix). 961 """ 962 # make sure the tree is rooted and bifurcating 963 if not tree.is_bifurcating(): 964 raise ValueError("The tree provided should be bifurcating.") 965 if not tree.rooted: 966 tree.root_at_midpoint() 967 # sort tree terminals and alignment 968 terms = tree.get_terminals() 969 terms.sort(key=lambda term: term.name) 970 alignment.sort() 971 if not all(t.name == a.id for t, a in zip(terms, alignment)): 972 raise ValueError( 973 "Taxon names of the input tree should be the same with the alignment.") 974 # term_align = dict(zip(terms, alignment)) 975 score = 0 976 for i in range(len(alignment[0])): 977 # parsimony score for column_i 978 score_i = 0 979 # get column 980 column_i = alignment[:, i] 981 # skip non-informative column 982 if column_i == len(column_i) * column_i[0]: 983 continue 984 985 # start calculating score_i using the tree and column_i 986 987 # Fitch algorithm without the penalty matrix 988 if not self.matrix: 989 # init by mapping terminal clades and states in column_i 990 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 991 for clade in tree.get_nonterminals(order="postorder"): 992 clade_childs = clade.clades 993 left_state = clade_states[clade_childs[0]] 994 right_state = clade_states[clade_childs[1]] 995 state = left_state & right_state 996 if not state: 997 state = left_state | right_state 998 score_i = score_i + 1 999 clade_states[clade] = state 1000 # Sankoff algorithm with the penalty matrix 1001 else: 1002 inf = float('inf') 1003 # init score arrays for terminal clades 1004 alphabet = self.matrix.names 1005 length = len(alphabet) 1006 clade_scores = {} 1007 for j in range(len(column_i)): 1008 array = [inf] * length 1009 index = alphabet.index(column_i[j]) 1010 array[index] = 0 1011 clade_scores[terms[j]] = array 1012 # bottom up calculation 1013 for clade in tree.get_nonterminals(order="postorder"): 1014 clade_childs = clade.clades 1015 left_score = clade_scores[clade_childs[0]] 1016 right_score = clade_scores[clade_childs[1]] 1017 array = [] 1018 for m in range(length): 1019 min_l = inf 1020 min_r = inf 1021 for n in range(length): 1022 sl = self.matrix[ 1023 alphabet[m], alphabet[n]] + left_score[n] 1024 sr = self.matrix[ 1025 alphabet[m], alphabet[n]] + right_score[n] 1026 if min_l > sl: 1027 min_l = sl 1028 if min_r > sr: 1029 min_r = sr 1030 array.append(min_l + min_r) 1031 clade_scores[clade] = array 1032 # minimum from root score 1033 score_i = min(array) 1034 # TODO: resolve internal states 1035 score = score + score_i 1036 return score
1037 1038
1039 -class ParsimonyTreeConstructor(TreeConstructor):
1040 """Parsimony tree constructor. 1041 1042 :Parameters: 1043 searcher : TreeSearcher 1044 tree searcher to search the best parsimony tree. 1045 starting_tree : Tree 1046 starting tree provided to the searcher. 1047 1048 Example 1049 ------- 1050 1051 >>> from Bio import AlignIO 1052 >>> from TreeConstruction import * 1053 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1054 >>> print aln 1055 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1056 AACGTGGCCACAT Alpha 1057 AAGGTCGCCACAC Beta 1058 GAGATTTCCGCCT Delta 1059 GAGATCTCCGCCC Epsilon 1060 CAGTTCGCCACAA Gamma 1061 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1062 >>> print tree 1063 Tree(weight=1.0, rooted=False) 1064 Clade(branch_length=0.0, name='Inner3') 1065 Clade(branch_length=0.01421, name='Inner2') 1066 Clade(branch_length=0.23927, name='Inner1') 1067 Clade(branch_length=0.08531, name='Epsilon') 1068 Clade(branch_length=0.13691, name='Delta') 1069 Clade(branch_length=0.29231, name='Alpha') 1070 Clade(branch_length=0.07477, name='Beta') 1071 Clade(branch_length=0.17523, name='Gamma') 1072 >>> from TreeConstruction import * 1073 >>> scorer = ParsimonyScorer() 1074 >>> searcher = NNITreeSearcher(scorer) 1075 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1076 >>> pars_tree = constructor.build_tree(aln) 1077 >>> print pars_tree 1078 Tree(weight=1.0, rooted=True) 1079 Clade(branch_length=0.0) 1080 Clade(branch_length=0.197335, name='Inner1') 1081 Clade(branch_length=0.13691, name='Delta') 1082 Clade(branch_length=0.08531, name='Epsilon') 1083 Clade(branch_length=0.041935, name='Inner2') 1084 Clade(branch_length=0.01421, name='Inner3') 1085 Clade(branch_length=0.17523, name='Gamma') 1086 Clade(branch_length=0.07477, name='Beta') 1087 Clade(branch_length=0.29231, name='Alpha') 1088 """ 1089
1090 - def __init__(self, searcher, starting_tree=None):
1091 self.searcher = searcher 1092 self.starting_tree = starting_tree
1093
1094 - def build_tree(self, alignment):
1095 """Build the tree. 1096 1097 :Parameters: 1098 alignment : MultipleSeqAlignment 1099 multiple sequence alignment to calculate parsimony tree. 1100 1101 """ 1102 # if starting_tree is none, 1103 # create a upgma tree with 'identity' scoring matrix 1104 if self.starting_tree is None: 1105 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1106 "upgma") 1107 self.starting_tree = dtc.build_tree(alignment) 1108 return self.searcher.search(self.starting_tree, alignment)
1109