Package Bio :: Package Phylo :: Module TreeConstruction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.TreeConstruction

   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes and methods for tree construction.""" 
   7   
   8  import itertools 
   9  import copy 
  10  from Bio.Phylo import BaseTree 
  11  from Bio.Align import MultipleSeqAlignment 
  12  from Bio.SubsMat import MatrixInfo 
  13  from Bio import _py3k 
  14  from Bio._py3k import zip, range 
  15   
  16   
17 -def _is_numeric(x):
18 return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19 20
21 -class _Matrix(object):
22 """Base class for distance matrix or scoring matrix. 23 24 Accepts a list of names and a lower triangular matrix.:: 25 26 matrix = [[0], 27 [1, 0], 28 [2, 3, 0], 29 [4, 5, 6, 0]] 30 represents the symmetric matrix of 31 [0,1,2,4] 32 [1,0,3,5] 33 [2,3,0,6] 34 [4,5,6,0] 35 36 :Parameters: 37 names : list 38 names of elements, used for indexing 39 matrix : list 40 nested list of numerical lists in lower triangular format 41 42 Examples 43 -------- 44 >>> from Bio.Phylo.TreeConstruction import _Matrix 45 >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta'] 46 >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]] 47 >>> m = _Matrix(names, matrix) 48 >>> m 49 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]) 50 51 You can use two indices to get or assign an element in the matrix. 52 53 >>> m[1,2] 54 3 55 >>> m['Beta','Gamma'] 56 3 57 >>> m['Beta','Gamma'] = 4 58 >>> m['Beta','Gamma'] 59 4 60 61 Further more, you can use one index to get or assign a list of elements related to that index. 62 63 >>> m[0] 64 [0, 1, 2, 4] 65 >>> m['Alpha'] 66 [0, 1, 2, 4] 67 >>> m['Alpha'] = [0, 7, 8, 9] 68 >>> m[0] 69 [0, 7, 8, 9] 70 >>> m[0,1] 71 7 72 73 Also you can delete or insert a column&row of elemets by index. 74 75 >>> m 76 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 77 >>> del m['Alpha'] 78 >>> m 79 _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]]) 80 >>> m.insert('Alpha', [0, 7, 8, 9] , 0) 81 >>> m 82 _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]]) 83 84 """ 85
86 - def __init__(self, names, matrix=None):
87 """Initialize matrix. 88 89 Arguments are a list of names, and optionally a list of lower 90 triangular matrix data (zero matrix used by default). 91 """ 92 # check names 93 if isinstance(names, list) and all(isinstance(s, str) for s in names): 94 if len(set(names)) == len(names): 95 self.names = names 96 else: 97 raise ValueError("Duplicate names found") 98 else: 99 raise TypeError("'names' should be a list of strings") 100 101 # check matrix 102 if matrix is None: 103 # create a new one with 0 if matrix is not assigned 104 matrix = [[0] * i for i in range(1, len(self) + 1)] 105 self.matrix = matrix 106 else: 107 # check if all elements are numbers 108 if (isinstance(matrix, list) and 109 all(isinstance(l, list) for l in matrix) and 110 all(_is_numeric(n) for n in [item for sublist in matrix 111 for item in sublist])): 112 # check if the same length with names 113 if len(matrix) == len(names): 114 # check if is lower triangle format 115 if [len(m) for m in matrix] == list(range(1, len(self) + 1)): 116 self.matrix = matrix 117 else: 118 raise ValueError( 119 "'matrix' should be in lower triangle format") 120 else: 121 raise ValueError( 122 "'names' and 'matrix' should be the same size") 123 else: 124 raise TypeError("'matrix' should be a list of numerical lists")
125
126 - def __getitem__(self, item):
127 """Access value(s) by the index(s) or name(s). 128 129 For a _Matrix object 'dm':: 130 131 dm[i] get a value list from the given 'i' to others; 132 dm[i, j] get the value between 'i' and 'j'; 133 dm['name'] map name to index first 134 dm['name1', 'name2'] map name to index first 135 136 """ 137 # Handle single indexing 138 if isinstance(item, (int, str)): 139 index = None 140 if isinstance(item, int): 141 index = item 142 elif isinstance(item, str): 143 if item in self.names: 144 index = self.names.index(item) 145 else: 146 raise ValueError("Item not found.") 147 else: 148 raise TypeError("Invalid index type.") 149 # check index 150 if index > len(self) - 1: 151 raise IndexError("Index out of range.") 152 return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))] 153 # Handle double indexing 154 elif len(item) == 2: 155 row_index = None 156 col_index = None 157 if all(isinstance(i, int) for i in item): 158 row_index, col_index = item 159 elif all(isinstance(i, str) for i in item): 160 row_name, col_name = item 161 if row_name in self.names and col_name in self.names: 162 row_index = self.names.index(row_name) 163 col_index = self.names.index(col_name) 164 else: 165 raise ValueError("Item not found.") 166 else: 167 raise TypeError("Invalid index type.") 168 # check index 169 if row_index > len(self) - 1 or col_index > len(self) - 1: 170 raise IndexError("Index out of range.") 171 if row_index > col_index: 172 return self.matrix[row_index][col_index] 173 else: 174 return self.matrix[col_index][row_index] 175 else: 176 raise TypeError("Invalid index type.")
177
178 - def __setitem__(self, item, value):
179 """Set value by the index(s) or name(s). 180 181 Similar to __getitem__:: 182 183 dm[1] = [1, 0, 3, 4] set values from '1' to others; 184 dm[i, j] = 2 set the value from 'i' to 'j' 185 186 """ 187 # Handle single indexing 188 if isinstance(item, (int, str)): 189 index = None 190 if isinstance(item, int): 191 index = item 192 elif isinstance(item, str): 193 if item in self.names: 194 index = self.names.index(item) 195 else: 196 raise ValueError("Item not found.") 197 else: 198 raise TypeError("Invalid index type.") 199 # check index 200 if index > len(self) - 1: 201 raise IndexError("Index out of range.") 202 # check and assign value 203 if isinstance(value, list) and all(_is_numeric(n) for n in value): 204 if len(value) == len(self): 205 for i in range(0, index): 206 self.matrix[index][i] = value[i] 207 for i in range(index, len(self)): 208 self.matrix[i][index] = value[i] 209 else: 210 raise ValueError("Value not the same size.") 211 else: 212 raise TypeError("Invalid value type.") 213 # Handle double indexing 214 elif len(item) == 2: 215 row_index = None 216 col_index = None 217 if all(isinstance(i, int) for i in item): 218 row_index, col_index = item 219 elif all(isinstance(i, str) for i in item): 220 row_name, col_name = item 221 if row_name in self.names and col_name in self.names: 222 row_index = self.names.index(row_name) 223 col_index = self.names.index(col_name) 224 else: 225 raise ValueError("Item not found.") 226 else: 227 raise TypeError("Invalid index type.") 228 # check index 229 if row_index > len(self) - 1 or col_index > len(self) - 1: 230 raise IndexError("Index out of range.") 231 # check and assign value 232 if _is_numeric(value): 233 if row_index > col_index: 234 self.matrix[row_index][col_index] = value 235 else: 236 self.matrix[col_index][row_index] = value 237 else: 238 raise TypeError("Invalid value type.") 239 else: 240 raise TypeError("Invalid index type.")
241
242 - def __delitem__(self, item):
243 """Delete related distances by the index or name.""" 244 index = None 245 if isinstance(item, int): 246 index = item 247 elif isinstance(item, str): 248 index = self.names.index(item) 249 else: 250 raise TypeError("Invalid index type.") 251 # remove distances related to index 252 for i in range(index + 1, len(self)): 253 del self.matrix[i][index] 254 del self.matrix[index] 255 # remove name 256 del self.names[index]
257
258 - def insert(self, name, value, index=None):
259 """Insert distances given the name and value. 260 261 :Parameters: 262 name : str 263 name of a row/col to be inserted 264 value : list 265 a row/col of values to be inserted 266 267 """ 268 if isinstance(name, str): 269 # insert at the given index or at the end 270 if index is None: 271 index = len(self) 272 if not isinstance(index, int): 273 raise TypeError("Invalid index type.") 274 # insert name 275 self.names.insert(index, name) 276 # insert elements of 0, to be assigned 277 self.matrix.insert(index, [0] * index) 278 for i in range(index, len(self)): 279 self.matrix[i].insert(index, 0) 280 # assign value 281 self[index] = value 282 else: 283 raise TypeError("Invalid name type.")
284
285 - def __len__(self):
286 """Matrix length.""" 287 return len(self.names)
288
289 - def __repr__(self):
290 return self.__class__.__name__ \ 291 + "(names=%s, matrix=%s)" \ 292 % tuple(map(repr, (self.names, self.matrix)))
293
294 - def __str__(self):
295 """Get a lower triangular matrix string.""" 296 matrix_string = '\n'.join( 297 [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]]) 298 for i in range(0, len(self))]) 299 matrix_string = matrix_string + "\n\t" + "\t".join(self.names) 300 return matrix_string
301 302
303 -class DistanceMatrix(_Matrix):
304 """Distance matrix class that can be used for distance based tree algorithms. 305 306 All diagonal elements will be zero no matter what the users provide. 307 """ 308
309 - def __init__(self, names, matrix=None):
310 """Initialize the class.""" 311 _Matrix.__init__(self, names, matrix) 312 self._set_zero_diagonal()
313
314 - def __setitem__(self, item, value):
315 _Matrix.__setitem__(self, item, value) 316 self._set_zero_diagonal()
317
318 - def _set_zero_diagonal(self):
319 """Set all diagonal elements to zero (PRIVATE).""" 320 for i in range(0, len(self)): 321 self.matrix[i][i] = 0
322
323 - def format_phylip(self, handle):
324 """Write data in Phylip format to a given file-like object or handle. 325 326 The output stream is the input distance matrix format used with Phylip 327 programs (e.g. 'neighbor'). See: 328 http://evolution.genetics.washington.edu/phylip/doc/neighbor.html 329 330 :Parameters: 331 handle : file or file-like object 332 A writeable file handle or other object supporting the 'write' 333 method, such as StringIO or sys.stdout. On Python 3, should be 334 open in text mode. 335 336 """ 337 handle.write(" {0}\n".format(len(self.names))) 338 # Phylip needs space-separated, vertically aligned columns 339 name_width = max(12, max(map(len, self.names)) + 1) 340 value_fmts = ("{" + str(x) + ":.4f}" 341 for x in range(1, len(self.matrix) + 1)) 342 row_fmt = "{0:" + str(name_width) + "s}" + " ".join(value_fmts) + "\n" 343 for i, (name, values) in enumerate(zip(self.names, self.matrix)): 344 # Mirror the matrix values across the diagonal 345 mirror_values = (self.matrix[j][i] 346 for j in range(i + 1, len(self.matrix))) 347 fields = itertools.chain([name], values, mirror_values) 348 handle.write(row_fmt.format(*fields))
349 350 351 # Shim for compatibility with Biopython<1.70 (#1304) 352 _DistanceMatrix = DistanceMatrix 353 354
355 -class DistanceCalculator(object):
356 """Class to calculate the distance matrix from a DNA or Protein. 357 358 Multiple Sequence Alignment(MSA) and the given name of the 359 substitution model. 360 361 Currently only scoring matrices are used. 362 363 :Parameters: 364 model : str 365 Name of the model matrix to be used to calculate distance. 366 The attribute `dna_matrices` contains the available model 367 names for DNA sequences and `protein_matrices` for protein 368 sequences. 369 370 Examples 371 -------- 372 >>> from Bio.Phylo.TreeConstruction import DistanceCalculator 373 >>> from Bio import AlignIO 374 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 375 >>> print aln 376 SingleLetterAlphabet() alignment with 5 rows and 13 columns 377 AACGTGGCCACAT Alpha 378 AAGGTCGCCACAC Beta 379 GAGATTTCCGCCT Delta 380 GAGATCTCCGCCC Epsilon 381 CAGTTCGCCACAA Gamma 382 383 DNA calculator with 'identity' model:: 384 385 >>> calculator = DistanceCalculator('identity') 386 >>> dm = calculator.get_distance(aln) 387 >>> print dm 388 Alpha 0 389 Beta 0.230769230769 0 390 Gamma 0.384615384615 0.230769230769 0 391 Delta 0.538461538462 0.538461538462 0.538461538462 0 392 Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 393 Alpha Beta Gamma Delta Epsilon 394 395 Protein calculator with 'blosum62' model:: 396 397 >>> calculator = DistanceCalculator('blosum62') 398 >>> dm = calculator.get_distance(aln) 399 >>> print dm 400 Alpha 0 401 Beta 0.369047619048 0 402 Gamma 0.493975903614 0.25 0 403 Delta 0.585365853659 0.547619047619 0.566265060241 0 404 Epsilon 0.7 0.355555555556 0.488888888889 0.222222222222 0 405 Alpha Beta Gamma Delta Epsilon 406 407 """ 408 409 dna_alphabet = ['A', 'T', 'C', 'G'] 410 411 # BLAST nucleic acid scoring matrix 412 blastn = [[5], 413 [-4, 5], 414 [-4, -4, 5], 415 [-4, -4, -4, 5]] 416 417 # transition/transversion scoring matrix 418 trans = [[6], 419 [-5, 6], 420 [-5, -1, 6], 421 [-1, -5, -5, 6]] 422 423 protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 424 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 425 'Z'] 426 427 # matrices available 428 dna_matrices = {'blastn': blastn, 'trans': trans} 429 protein_models = MatrixInfo.available_matrices 430 protein_matrices = dict((name, getattr(MatrixInfo, name)) 431 for name in protein_models) 432 433 dna_models = list(dna_matrices.keys()) 434 435 models = ['identity'] + dna_models + protein_models 436
437 - def __init__(self, model='identity', skip_letters=None):
438 """Initialize with a distance model.""" 439 # Shim for backward compatibility (#491) 440 if skip_letters: 441 self.skip_letters = skip_letters 442 elif model == 'identity': 443 self.skip_letters = () 444 else: 445 self.skip_letters = ('-', '*') 446 447 if model == 'identity': 448 self.scoring_matrix = None 449 elif model in self.dna_models: 450 self.scoring_matrix = _Matrix(self.dna_alphabet, 451 self.dna_matrices[model]) 452 elif model in self.protein_models: 453 self.scoring_matrix = self._build_protein_matrix( 454 self.protein_matrices[model]) 455 else: 456 raise ValueError("Model not supported. Available models: " + 457 ", ".join(self.models))
458
459 - def _pairwise(self, seq1, seq2):
460 """Calculate pairwise distance from two sequences (PRIVATE). 461 462 Returns a value between 0 (identical sequences) and 1 (completely 463 different, or seq1 is an empty string.) 464 """ 465 score = 0 466 max_score = 0 467 if self.scoring_matrix: 468 max_score1 = 0 469 max_score2 = 0 470 for i in range(0, len(seq1)): 471 l1 = seq1[i] 472 l2 = seq2[i] 473 if l1 in self.skip_letters or l2 in self.skip_letters: 474 continue 475 if l1 not in self.scoring_matrix.names: 476 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 477 % (l1, seq1.id, i)) 478 if l2 not in self.scoring_matrix.names: 479 raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'" 480 % (l2, seq2.id, i)) 481 max_score1 += self.scoring_matrix[l1, l1] 482 max_score2 += self.scoring_matrix[l2, l2] 483 score += self.scoring_matrix[l1, l2] 484 # Take the higher score if the matrix is asymmetrical 485 max_score = max(max_score1, max_score2) 486 else: 487 # Score by character identity, not skipping any special letters 488 score = sum(l1 == l2 489 for l1, l2 in zip(seq1, seq2) 490 if l1 not in self.skip_letters and l2 not in self.skip_letters) 491 max_score = len(seq1) 492 if max_score == 0: 493 return 1 # max possible scaled distance 494 return 1 - (score * 1.0 / max_score)
495
496 - def get_distance(self, msa):
497 """Return a DistanceMatrix for MSA object. 498 499 :Parameters: 500 msa : MultipleSeqAlignment 501 DNA or Protein multiple sequence alignment. 502 503 """ 504 if not isinstance(msa, MultipleSeqAlignment): 505 raise TypeError("Must provide a MultipleSeqAlignment object.") 506 507 names = [s.id for s in msa] 508 dm = DistanceMatrix(names) 509 for seq1, seq2 in itertools.combinations(msa, 2): 510 dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2) 511 return dm
512
513 - def _build_protein_matrix(self, subsmat):
514 """Convert matrix from SubsMat format to _Matrix object (PRIVATE).""" 515 protein_matrix = _Matrix(self.protein_alphabet) 516 for k, v in subsmat.items(): 517 aa1, aa2 = k 518 protein_matrix[aa1, aa2] = v 519 return protein_matrix
520 521
522 -class TreeConstructor(object):
523 """Base class for all tree constructor.""" 524
525 - def build_tree(self, msa):
526 """Caller to built the tree from a MultipleSeqAlignment object. 527 528 This should be implemented in subclass. 529 """ 530 raise NotImplementedError("Method not implemented!")
531 532
533 -class DistanceTreeConstructor(TreeConstructor):
534 """Distance based tree constructor. 535 536 :Parameters: 537 method : str 538 Distance tree construction method, 'nj'(default) or 'upgma'. 539 distance_calculator : DistanceCalculator 540 The distance matrix calculator for multiple sequence alignment. 541 It must be provided if `build_tree` will be called. 542 543 Examples 544 -------- 545 >>> from TreeConstruction import DistanceTreeConstructor 546 >>> constructor = DistanceTreeConstructor() 547 548 UPGMA Tree: 549 550 >>> upgmatree = constructor.upgma(dm) 551 >>> print upgmatree 552 Tree(rooted=True) 553 Clade(name='Inner4') 554 Clade(branch_length=0.171955155115, name='Inner1') 555 Clade(branch_length=0.111111111111, name='Epsilon') 556 Clade(branch_length=0.111111111111, name='Delta') 557 Clade(branch_length=0.0673103855608, name='Inner3') 558 Clade(branch_length=0.0907558806655, name='Inner2') 559 Clade(branch_length=0.125, name='Gamma') 560 Clade(branch_length=0.125, name='Beta') 561 Clade(branch_length=0.215755880666, name='Alpha') 562 563 NJ Tree: 564 565 >>> njtree = constructor.nj(dm) 566 >>> print njtree 567 Tree(rooted=False) 568 Clade(name='Inner3') 569 Clade(branch_length=0.0142054862889, name='Inner2') 570 Clade(branch_length=0.239265540676, name='Inner1') 571 Clade(branch_length=0.0853101915988, name='Epsilon') 572 Clade(branch_length=0.136912030623, name='Delta') 573 Clade(branch_length=0.292306275042, name='Alpha') 574 Clade(branch_length=0.0747705106139, name='Beta') 575 Clade(branch_length=0.175229489386, name='Gamma') 576 577 578 """ 579 580 methods = ['nj', 'upgma'] 581
582 - def __init__(self, distance_calculator=None, method="nj"):
583 """Initialize the class.""" 584 if (distance_calculator is None or isinstance(distance_calculator, DistanceCalculator)): 585 self.distance_calculator = distance_calculator 586 else: 587 raise TypeError("Must provide a DistanceCalculator object.") 588 if isinstance(method, str) and method in self.methods: 589 self.method = method 590 else: 591 raise TypeError("Bad method: " + method + 592 ". Available methods: " + ", ".join(self.methods))
593
594 - def build_tree(self, msa):
595 if self.distance_calculator: 596 dm = self.distance_calculator.get_distance(msa) 597 tree = None 598 if self.method == 'upgma': 599 tree = self.upgma(dm) 600 else: 601 tree = self.nj(dm) 602 return tree 603 else: 604 raise TypeError("Must provide a DistanceCalculator object.")
605
606 - def upgma(self, distance_matrix):
607 """Construct and return an UPGMA tree. 608 609 Constructs and returns an Unweighted Pair Group Method 610 with Arithmetic mean (UPGMA) tree. 611 612 :Parameters: 613 distance_matrix : DistanceMatrix 614 The distance matrix for tree construction. 615 616 """ 617 if not isinstance(distance_matrix, DistanceMatrix): 618 raise TypeError("Must provide a DistanceMatrix object.") 619 620 # make a copy of the distance matrix to be used 621 dm = copy.deepcopy(distance_matrix) 622 # init terminal clades 623 clades = [BaseTree.Clade(None, name) for name in dm.names] 624 # init minimum index 625 min_i = 0 626 min_j = 0 627 inner_count = 0 628 while len(dm) > 1: 629 min_dist = dm[1, 0] 630 # find minimum index 631 for i in range(1, len(dm)): 632 for j in range(0, i): 633 if min_dist >= dm[i, j]: 634 min_dist = dm[i, j] 635 min_i = i 636 min_j = j 637 638 # create clade 639 clade1 = clades[min_i] 640 clade2 = clades[min_j] 641 inner_count += 1 642 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 643 inner_clade.clades.append(clade1) 644 inner_clade.clades.append(clade2) 645 # assign branch length 646 if clade1.is_terminal(): 647 clade1.branch_length = min_dist * 1.0 / 2 648 else: 649 clade1.branch_length = min_dist * \ 650 1.0 / 2 - self._height_of(clade1) 651 652 if clade2.is_terminal(): 653 clade2.branch_length = min_dist * 1.0 / 2 654 else: 655 clade2.branch_length = min_dist * \ 656 1.0 / 2 - self._height_of(clade2) 657 658 # update node list 659 clades[min_j] = inner_clade 660 del clades[min_i] 661 662 # rebuild distance matrix, 663 # set the distances of new node at the index of min_j 664 for k in range(0, len(dm)): 665 if k != min_i and k != min_j: 666 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2 667 668 dm.names[min_j] = "Inner" + str(inner_count) 669 670 del dm[min_i] 671 inner_clade.branch_length = 0 672 return BaseTree.Tree(inner_clade)
673
674 - def nj(self, distance_matrix):
675 """Construct and return a Neighbor Joining tree. 676 677 :Parameters: 678 distance_matrix : DistanceMatrix 679 The distance matrix for tree construction. 680 681 """ 682 if not isinstance(distance_matrix, DistanceMatrix): 683 raise TypeError("Must provide a DistanceMatrix object.") 684 685 # make a copy of the distance matrix to be used 686 dm = copy.deepcopy(distance_matrix) 687 # init terminal clades 688 clades = [BaseTree.Clade(None, name) for name in dm.names] 689 # init node distance 690 node_dist = [0] * len(dm) 691 # init minimum index 692 min_i = 0 693 min_j = 0 694 inner_count = 0 695 while len(dm) > 2: 696 # calculate nodeDist 697 for i in range(0, len(dm)): 698 node_dist[i] = 0 699 for j in range(0, len(dm)): 700 node_dist[i] += dm[i, j] 701 node_dist[i] = node_dist[i] / (len(dm) - 2) 702 703 # find minimum distance pair 704 min_dist = dm[1, 0] - node_dist[1] - node_dist[0] 705 min_i = 0 706 min_j = 1 707 for i in range(1, len(dm)): 708 for j in range(0, i): 709 temp = dm[i, j] - node_dist[i] - node_dist[j] 710 if min_dist > temp: 711 min_dist = temp 712 min_i = i 713 min_j = j 714 # create clade 715 clade1 = clades[min_i] 716 clade2 = clades[min_j] 717 inner_count += 1 718 inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count)) 719 inner_clade.clades.append(clade1) 720 inner_clade.clades.append(clade2) 721 # assign branch length 722 clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] - 723 node_dist[min_j]) / 2.0 724 clade2.branch_length = dm[min_i, min_j] - clade1.branch_length 725 726 # update node list 727 clades[min_j] = inner_clade 728 del clades[min_i] 729 730 # rebuild distance matrix, 731 # set the distances of new node at the index of min_j 732 for k in range(0, len(dm)): 733 if k != min_i and k != min_j: 734 dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] - 735 dm[min_i, min_j]) / 2.0 736 737 dm.names[min_j] = "Inner" + str(inner_count) 738 del dm[min_i] 739 740 # set the last clade as one of the child of the inner_clade 741 root = None 742 if clades[0] == inner_clade: 743 clades[0].branch_length = 0 744 clades[1].branch_length = dm[1, 0] 745 clades[0].clades.append(clades[1]) 746 root = clades[0] 747 else: 748 clades[0].branch_length = dm[1, 0] 749 clades[1].branch_length = 0 750 clades[1].clades.append(clades[0]) 751 root = clades[1] 752 753 return BaseTree.Tree(root, rooted=False)
754
755 - def _height_of(self, clade):
756 """Calculate clade height -- the longest path to any terminal (PRIVATE).""" 757 height = 0 758 if clade.is_terminal(): 759 height = clade.branch_length 760 else: 761 height = height + max([self._height_of(c) for c in clade.clades]) 762 return height
763 764 # #################### Tree Scoring and Searching Classes ##################### 765 766
767 -class Scorer(object):
768 """Base class for all tree scoring methods.""" 769
770 - def get_score(self, tree, alignment):
771 """Caller to get the score of a tree for the given alignment. 772 773 This should be implemented in subclass. 774 """ 775 raise NotImplementedError("Method not implemented!")
776 777
778 -class TreeSearcher(object):
779 """Base class for all tree searching methods.""" 780
781 - def search(self, starting_tree, alignment):
782 """Caller to search the best tree with a starting tree. 783 784 This should be implemented in subclass. 785 """ 786 raise NotImplementedError("Method not implemented!")
787 788
789 -class NNITreeSearcher(TreeSearcher):
790 """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm. 791 792 :Parameters: 793 scorer : ParsimonyScorer 794 parsimony scorer to calculate the parsimony score of 795 different trees during NNI algorithm. 796 797 """ 798
799 - def __init__(self, scorer):
800 """Initialize the class.""" 801 if isinstance(scorer, Scorer): 802 self.scorer = scorer 803 else: 804 raise TypeError("Must provide a Scorer object.")
805
806 - def search(self, starting_tree, alignment):
807 """Implement the TreeSearcher.search method. 808 809 :Parameters: 810 starting_tree : Tree 811 starting tree of NNI method. 812 alignment : MultipleSeqAlignment 813 multiple sequence alignment used to calculate parsimony 814 score of different NNI trees. 815 816 """ 817 return self._nni(starting_tree, alignment)
818
819 - def _nni(self, starting_tree, alignment):
820 """Search for the best parsimony tree using the NNI algorithm (PRIVATE).""" 821 best_tree = starting_tree 822 while True: 823 best_score = self.scorer.get_score(best_tree, alignment) 824 temp = best_score 825 for t in self._get_neighbors(best_tree): 826 score = self.scorer.get_score(t, alignment) 827 if score < best_score: 828 best_score = score 829 best_tree = t 830 # stop if no smaller score exist 831 if best_score >= temp: 832 break 833 return best_tree
834
835 - def _get_neighbors(self, tree):
836 """Get all neighbor trees of the given tree (PRIVATE). 837 838 Currently only for binary rooted trees. 839 """ 840 # make child to parent dict 841 parents = {} 842 for clade in tree.find_clades(): 843 if clade != tree.root: 844 node_path = tree.get_path(clade) 845 # cannot get the parent if the parent is root. Bug? 846 if len(node_path) == 1: 847 parents[clade] = tree.root 848 else: 849 parents[clade] = node_path[-2] 850 neighbors = [] 851 root_childs = [] 852 for clade in tree.get_nonterminals(order="level"): 853 if clade == tree.root: 854 left = clade.clades[0] 855 right = clade.clades[1] 856 root_childs.append(left) 857 root_childs.append(right) 858 if not left.is_terminal() and not right.is_terminal(): 859 # make changes around the left_left clade 860 # left_left = left.clades[0] 861 left_right = left.clades[1] 862 right_left = right.clades[0] 863 right_right = right.clades[1] 864 # neightbor 1 (left_left + right_right) 865 del left.clades[1] 866 del right.clades[1] 867 left.clades.append(right_right) 868 right.clades.append(left_right) 869 temp_tree = copy.deepcopy(tree) 870 neighbors.append(temp_tree) 871 # neighbor 2 (left_left + right_left) 872 del left.clades[1] 873 del right.clades[0] 874 left.clades.append(right_left) 875 right.clades.append(right_right) 876 temp_tree = copy.deepcopy(tree) 877 neighbors.append(temp_tree) 878 # change back (left_left + left_right) 879 del left.clades[1] 880 del right.clades[0] 881 left.clades.append(left_right) 882 right.clades.insert(0, right_left) 883 elif clade in root_childs: 884 # skip root child 885 continue 886 else: 887 # method for other clades 888 # make changes around the parent clade 889 left = clade.clades[0] 890 right = clade.clades[1] 891 parent = parents[clade] 892 if clade == parent.clades[0]: 893 sister = parent.clades[1] 894 # neighbor 1 (parent + right) 895 del parent.clades[1] 896 del clade.clades[1] 897 parent.clades.append(right) 898 clade.clades.append(sister) 899 temp_tree = copy.deepcopy(tree) 900 neighbors.append(temp_tree) 901 # neighbor 2 (parent + left) 902 del parent.clades[1] 903 del clade.clades[0] 904 parent.clades.append(left) 905 clade.clades.append(right) 906 temp_tree = copy.deepcopy(tree) 907 neighbors.append(temp_tree) 908 # change back (parent + sister) 909 del parent.clades[1] 910 del clade.clades[0] 911 parent.clades.append(sister) 912 clade.clades.insert(0, left) 913 else: 914 sister = parent.clades[0] 915 # neighbor 1 (parent + right) 916 del parent.clades[0] 917 del clade.clades[1] 918 parent.clades.insert(0, right) 919 clade.clades.append(sister) 920 temp_tree = copy.deepcopy(tree) 921 neighbors.append(temp_tree) 922 # neighbor 2 (parent + left) 923 del parent.clades[0] 924 del clade.clades[0] 925 parent.clades.insert(0, left) 926 clade.clades.append(right) 927 temp_tree = copy.deepcopy(tree) 928 neighbors.append(temp_tree) 929 # change back (parent + sister) 930 del parent.clades[0] 931 del clade.clades[0] 932 parent.clades.insert(0, sister) 933 clade.clades.insert(0, left) 934 return neighbors
935 936 # ######################## Parsimony Classes ########################## 937 938
939 -class ParsimonyScorer(Scorer):
940 """Parsimony scorer with a scoring matrix. 941 942 This is a combination of Fitch algorithm and Sankoff algorithm. 943 See ParsimonyTreeConstructor for usage. 944 945 :Parameters: 946 matrix : _Matrix 947 scoring matrix used in parsimony score calculation. 948 949 """ 950
951 - def __init__(self, matrix=None):
952 """Initialize the class.""" 953 if not matrix or isinstance(matrix, _Matrix): 954 self.matrix = matrix 955 else: 956 raise TypeError("Must provide a _Matrix object.")
957
958 - def get_score(self, tree, alignment):
959 """Calculate parsimony score using the Fitch algorithm. 960 961 Calculate and return the parsimony score given a tree and the 962 MSA using either the Fitch algorithm (without a penalty matrix) 963 or the Sankoff algorithm (with a matrix). 964 """ 965 # make sure the tree is rooted and bifurcating 966 if not tree.is_bifurcating(): 967 raise ValueError("The tree provided should be bifurcating.") 968 if not tree.rooted: 969 tree.root_at_midpoint() 970 # sort tree terminals and alignment 971 terms = tree.get_terminals() 972 terms.sort(key=lambda term: term.name) 973 alignment.sort() 974 if not all(t.name == a.id for t, a in zip(terms, alignment)): 975 raise ValueError( 976 "Taxon names of the input tree should be the same with the alignment.") 977 # term_align = dict(zip(terms, alignment)) 978 score = 0 979 for i in range(len(alignment[0])): 980 # parsimony score for column_i 981 score_i = 0 982 # get column 983 column_i = alignment[:, i] 984 # skip non-informative column 985 if column_i == len(column_i) * column_i[0]: 986 continue 987 988 # start calculating score_i using the tree and column_i 989 990 # Fitch algorithm without the penalty matrix 991 if not self.matrix: 992 # init by mapping terminal clades and states in column_i 993 clade_states = dict(zip(terms, [set([c]) for c in column_i])) 994 for clade in tree.get_nonterminals(order="postorder"): 995 clade_childs = clade.clades 996 left_state = clade_states[clade_childs[0]] 997 right_state = clade_states[clade_childs[1]] 998 state = left_state & right_state 999 if not state: 1000 state = left_state | right_state 1001 score_i = score_i + 1 1002 clade_states[clade] = state 1003 # Sankoff algorithm with the penalty matrix 1004 else: 1005 inf = float('inf') 1006 # init score arrays for terminal clades 1007 alphabet = self.matrix.names 1008 length = len(alphabet) 1009 clade_scores = {} 1010 for j in range(len(column_i)): 1011 array = [inf] * length 1012 index = alphabet.index(column_i[j]) 1013 array[index] = 0 1014 clade_scores[terms[j]] = array 1015 # bottom up calculation 1016 for clade in tree.get_nonterminals(order="postorder"): 1017 clade_childs = clade.clades 1018 left_score = clade_scores[clade_childs[0]] 1019 right_score = clade_scores[clade_childs[1]] 1020 array = [] 1021 for m in range(length): 1022 min_l = inf 1023 min_r = inf 1024 for n in range(length): 1025 sl = self.matrix[ 1026 alphabet[m], alphabet[n]] + left_score[n] 1027 sr = self.matrix[ 1028 alphabet[m], alphabet[n]] + right_score[n] 1029 if min_l > sl: 1030 min_l = sl 1031 if min_r > sr: 1032 min_r = sr 1033 array.append(min_l + min_r) 1034 clade_scores[clade] = array 1035 # minimum from root score 1036 score_i = min(array) 1037 # TODO: resolve internal states 1038 score = score + score_i 1039 return score
1040 1041
1042 -class ParsimonyTreeConstructor(TreeConstructor):
1043 """Parsimony tree constructor. 1044 1045 :Parameters: 1046 searcher : TreeSearcher 1047 tree searcher to search the best parsimony tree. 1048 starting_tree : Tree 1049 starting tree provided to the searcher. 1050 1051 Examples 1052 -------- 1053 >>> from Bio import AlignIO 1054 >>> from TreeConstruction import * 1055 >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip') 1056 >>> print aln 1057 SingleLetterAlphabet() alignment with 5 rows and 13 columns 1058 AACGTGGCCACAT Alpha 1059 AAGGTCGCCACAC Beta 1060 GAGATTTCCGCCT Delta 1061 GAGATCTCCGCCC Epsilon 1062 CAGTTCGCCACAA Gamma 1063 >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick') 1064 >>> print tree 1065 Tree(weight=1.0, rooted=False) 1066 Clade(branch_length=0.0, name='Inner3') 1067 Clade(branch_length=0.01421, name='Inner2') 1068 Clade(branch_length=0.23927, name='Inner1') 1069 Clade(branch_length=0.08531, name='Epsilon') 1070 Clade(branch_length=0.13691, name='Delta') 1071 Clade(branch_length=0.29231, name='Alpha') 1072 Clade(branch_length=0.07477, name='Beta') 1073 Clade(branch_length=0.17523, name='Gamma') 1074 >>> from TreeConstruction import * 1075 >>> scorer = ParsimonyScorer() 1076 >>> searcher = NNITreeSearcher(scorer) 1077 >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree) 1078 >>> pars_tree = constructor.build_tree(aln) 1079 >>> print pars_tree 1080 Tree(weight=1.0, rooted=True) 1081 Clade(branch_length=0.0) 1082 Clade(branch_length=0.197335, name='Inner1') 1083 Clade(branch_length=0.13691, name='Delta') 1084 Clade(branch_length=0.08531, name='Epsilon') 1085 Clade(branch_length=0.041935, name='Inner2') 1086 Clade(branch_length=0.01421, name='Inner3') 1087 Clade(branch_length=0.17523, name='Gamma') 1088 Clade(branch_length=0.07477, name='Beta') 1089 Clade(branch_length=0.29231, name='Alpha') 1090 1091 """ 1092
1093 - def __init__(self, searcher, starting_tree=None):
1094 """Initialize the class.""" 1095 self.searcher = searcher 1096 self.starting_tree = starting_tree
1097
1098 - def build_tree(self, alignment):
1099 """Build the tree. 1100 1101 :Parameters: 1102 alignment : MultipleSeqAlignment 1103 multiple sequence alignment to calculate parsimony tree. 1104 1105 """ 1106 # if starting_tree is none, 1107 # create a upgma tree with 'identity' scoring matrix 1108 if self.starting_tree is None: 1109 dtc = DistanceTreeConstructor(DistanceCalculator("identity"), 1110 "upgma") 1111 self.starting_tree = dtc.build_tree(alignment) 1112 return self.searcher.search(self.starting_tree, alignment)
1113