[frames] | no frames]

# Source Code for Module Bio.Phylo.TreeConstruction

```   1  # Copyright (C) 2013 by Yanbo Ye (yeyanbo289@gmail.com)
2  # This code is part of the Biopython distribution and governed by its
4  # as part of this package.
5
6  """Classes and methods for tree construction."""
7
8  import itertools
9  import copy
10  from Bio.Phylo import BaseTree
11  from Bio.Align import MultipleSeqAlignment
12  from Bio.SubsMat import MatrixInfo
13  from Bio import _py3k
14  from Bio._py3k import zip, range
15
16
17 -def _is_numeric(x):
18      return _py3k._is_int_or_long(x) or isinstance(x, (float, complex))
19
20
21 -class _Matrix(object):
22      """Base class for distance matrix or scoring matrix.
23
24      Accepts a list of names and a lower triangular matrix.::
25
26          matrix = [[0],
27                    [1, 0],
28                    [2, 3, 0],
29                    [4, 5, 6, 0]]
30          represents the symmetric matrix of
31          [0,1,2,4]
32          [1,0,3,5]
33          [2,3,0,6]
34          [4,5,6,0]
35
36      :Parameters:
37          names : list
38              names of elements, used for indexing
39          matrix : list
40              nested list of numerical lists in lower triangular format
41
42      Examples
43      --------
44      >>> from Bio.Phylo.TreeConstruction import _Matrix
45      >>> names = ['Alpha', 'Beta', 'Gamma', 'Delta']
46      >>> matrix = [[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]]
47      >>> m = _Matrix(names, matrix)
48      >>> m
49      _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])
50
51      You can use two indices to get or assign an element in the matrix.
52
53      >>> m[1,2]
54      3
55      >>> m['Beta','Gamma']
56      3
57      >>> m['Beta','Gamma'] = 4
58      >>> m['Beta','Gamma']
59      4
60
61      Further more, you can use one index to get or assign a list of elements related to that index.
62
63      >>> m[0]
64      [0, 1, 2, 4]
65      >>> m['Alpha']
66      [0, 1, 2, 4]
67      >>> m['Alpha'] = [0, 7, 8, 9]
68      >>> m[0]
69      [0, 7, 8, 9]
70      >>> m[0,1]
71      7
72
73      Also you can delete or insert a column&row of elemets by index.
74
75      >>> m
76      _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]])
77      >>> del m['Alpha']
78      >>> m
79      _Matrix(names=['Beta', 'Gamma', 'Delta'], matrix=[[0], [4, 0], [5, 6, 0]])
80      >>> m.insert('Alpha', [0, 7, 8, 9] , 0)
81      >>> m
82      _Matrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], matrix=[[0], [7, 0], [8, 4, 0], [9, 5, 6, 0]])
83
84      """
85
86 -    def __init__(self, names, matrix=None):
87          """Initialize matrix.
88
89          Arguments are a list of names, and optionally a list of lower
90          triangular matrix data (zero matrix used by default).
91          """
92          # check names
93          if isinstance(names, list) and all(isinstance(s, str) for s in names):
94              if len(set(names)) == len(names):
95                  self.names = names
96              else:
97                  raise ValueError("Duplicate names found")
98          else:
99              raise TypeError("'names' should be a list of strings")
100
101          # check matrix
102          if matrix is None:
103              # create a new one with 0 if matrix is not assigned
104              matrix = [[0] * i for i in range(1, len(self) + 1)]
105              self.matrix = matrix
106          else:
107              # check if all elements are numbers
108              if (isinstance(matrix, list) and
109                  all(isinstance(l, list) for l in matrix) and
110                  all(_is_numeric(n) for n in [item for sublist in matrix
111                                               for item in sublist])):
112                  # check if the same length with names
113                  if len(matrix) == len(names):
114                      # check if is lower triangle format
115                      if [len(m) for m in matrix] == list(range(1, len(self) + 1)):
116                          self.matrix = matrix
117                      else:
118                          raise ValueError(
119                              "'matrix' should be in lower triangle format")
120                  else:
121                      raise ValueError(
122                          "'names' and 'matrix' should be the same size")
123              else:
124                  raise TypeError("'matrix' should be a list of numerical lists")
125
126 -    def __getitem__(self, item):
127          """Access value(s) by the index(s) or name(s).
128
129          For a _Matrix object 'dm'::
130
131              dm[i]                   get a value list from the given 'i' to others;
132              dm[i, j]                get the value between 'i' and 'j';
133              dm['name']              map name to index first
134              dm['name1', 'name2']    map name to index first
135
136          """
137          # Handle single indexing
138          if isinstance(item, (int, str)):
139              index = None
140              if isinstance(item, int):
141                  index = item
142              elif isinstance(item, str):
143                  if item in self.names:
144                      index = self.names.index(item)
145                  else:
147              else:
148                  raise TypeError("Invalid index type.")
149              # check index
150              if index > len(self) - 1:
151                  raise IndexError("Index out of range.")
152              return [self.matrix[index][i] for i in range(0, index)] + [self.matrix[i][index] for i in range(index, len(self))]
153          # Handle double indexing
154          elif len(item) == 2:
155              row_index = None
156              col_index = None
157              if all(isinstance(i, int) for i in item):
158                  row_index, col_index = item
159              elif all(isinstance(i, str) for i in item):
160                  row_name, col_name = item
161                  if row_name in self.names and col_name in self.names:
162                      row_index = self.names.index(row_name)
163                      col_index = self.names.index(col_name)
164                  else:
166              else:
167                  raise TypeError("Invalid index type.")
168              # check index
169              if row_index > len(self) - 1 or col_index > len(self) - 1:
170                  raise IndexError("Index out of range.")
171              if row_index > col_index:
172                  return self.matrix[row_index][col_index]
173              else:
174                  return self.matrix[col_index][row_index]
175          else:
176              raise TypeError("Invalid index type.")
177
178 -    def __setitem__(self, item, value):
179          """Set value by the index(s) or name(s).
180
181          Similar to __getitem__::
182
183              dm[1] = [1, 0, 3, 4]    set values from '1' to others;
184              dm[i, j] = 2            set the value from 'i' to 'j'
185
186          """
187          # Handle single indexing
188          if isinstance(item, (int, str)):
189              index = None
190              if isinstance(item, int):
191                  index = item
192              elif isinstance(item, str):
193                  if item in self.names:
194                      index = self.names.index(item)
195                  else:
197              else:
198                  raise TypeError("Invalid index type.")
199              # check index
200              if index > len(self) - 1:
201                  raise IndexError("Index out of range.")
202              # check and assign value
203              if isinstance(value, list) and all(_is_numeric(n) for n in value):
204                  if len(value) == len(self):
205                      for i in range(0, index):
206                          self.matrix[index][i] = value[i]
207                      for i in range(index, len(self)):
208                          self.matrix[i][index] = value[i]
209                  else:
210                      raise ValueError("Value not the same size.")
211              else:
212                  raise TypeError("Invalid value type.")
213          # Handle double indexing
214          elif len(item) == 2:
215              row_index = None
216              col_index = None
217              if all(isinstance(i, int) for i in item):
218                  row_index, col_index = item
219              elif all(isinstance(i, str) for i in item):
220                  row_name, col_name = item
221                  if row_name in self.names and col_name in self.names:
222                      row_index = self.names.index(row_name)
223                      col_index = self.names.index(col_name)
224                  else:
226              else:
227                  raise TypeError("Invalid index type.")
228              # check index
229              if row_index > len(self) - 1 or col_index > len(self) - 1:
230                  raise IndexError("Index out of range.")
231              # check and assign value
232              if _is_numeric(value):
233                  if row_index > col_index:
234                      self.matrix[row_index][col_index] = value
235                  else:
236                      self.matrix[col_index][row_index] = value
237              else:
238                  raise TypeError("Invalid value type.")
239          else:
240              raise TypeError("Invalid index type.")
241
242 -    def __delitem__(self, item):
243          """Delete related distances by the index or name."""
244          index = None
245          if isinstance(item, int):
246              index = item
247          elif isinstance(item, str):
248              index = self.names.index(item)
249          else:
250              raise TypeError("Invalid index type.")
251          # remove distances related to index
252          for i in range(index + 1, len(self)):
253              del self.matrix[i][index]
254          del self.matrix[index]
255          # remove name
256          del self.names[index]
257
258 -    def insert(self, name, value, index=None):
259          """Insert distances given the name and value.
260
261          :Parameters:
262              name : str
263                  name of a row/col to be inserted
264              value : list
265                  a row/col of values to be inserted
266
267          """
268          if isinstance(name, str):
269              # insert at the given index or at the end
270              if index is None:
271                  index = len(self)
272              if not isinstance(index, int):
273                  raise TypeError("Invalid index type.")
274              # insert name
275              self.names.insert(index, name)
276              # insert elements of 0, to be assigned
277              self.matrix.insert(index, [0] * index)
278              for i in range(index, len(self)):
279                  self.matrix[i].insert(index, 0)
280              # assign value
281              self[index] = value
282          else:
283              raise TypeError("Invalid name type.")
284
285 -    def __len__(self):
286          """Matrix length."""
287          return len(self.names)
288
289 -    def __repr__(self):
290          return self.__class__.__name__ \
291              + "(names=%s, matrix=%s)" \
292              % tuple(map(repr, (self.names, self.matrix)))
293
294 -    def __str__(self):
295          """Get a lower triangular matrix string."""
296          matrix_string = '\n'.join(
297              [self.names[i] + "\t" + "\t".join([str(n) for n in self.matrix[i]])
298               for i in range(0, len(self))])
299          matrix_string = matrix_string + "\n\t" + "\t".join(self.names)
300          return matrix_string
301
302
303 -class DistanceMatrix(_Matrix):
304      """Distance matrix class that can be used for distance based tree algorithms.
305
306      All diagonal elements will be zero no matter what the users provide.
307      """
308
309 -    def __init__(self, names, matrix=None):
310          """Initialize the class."""
311          _Matrix.__init__(self, names, matrix)
312          self._set_zero_diagonal()
313
314 -    def __setitem__(self, item, value):
315          _Matrix.__setitem__(self, item, value)
316          self._set_zero_diagonal()
317
318 -    def _set_zero_diagonal(self):
319          """Set all diagonal elements to zero (PRIVATE)."""
320          for i in range(0, len(self)):
321              self.matrix[i][i] = 0
322
323 -    def format_phylip(self, handle):
324          """Write data in Phylip format to a given file-like object or handle.
325
326          The output stream is the input distance matrix format used with Phylip
327          programs (e.g. 'neighbor'). See:
328          http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
329
330          :Parameters:
331              handle : file or file-like object
332                  A writeable file handle or other object supporting the 'write'
333                  method, such as StringIO or sys.stdout. On Python 3, should be
334                  open in text mode.
335
336          """
337          handle.write("    {0}\n".format(len(self.names)))
338          # Phylip needs space-separated, vertically aligned columns
339          name_width = max(12, max(map(len, self.names)) + 1)
340          value_fmts = ("{" + str(x) + ":.4f}"
341                        for x in range(1, len(self.matrix) + 1))
342          row_fmt = "{0:" + str(name_width) + "s}" + "  ".join(value_fmts) + "\n"
343          for i, (name, values) in enumerate(zip(self.names, self.matrix)):
344              # Mirror the matrix values across the diagonal
345              mirror_values = (self.matrix[j][i]
346                               for j in range(i + 1, len(self.matrix)))
347              fields = itertools.chain([name], values, mirror_values)
348              handle.write(row_fmt.format(*fields))
349
350
351  # Shim for compatibility with Biopython<1.70 (#1304)
352  _DistanceMatrix = DistanceMatrix
353
354
355 -class DistanceCalculator(object):
356      """Class to calculate the distance matrix from a DNA or Protein.
357
358      Multiple Sequence Alignment(MSA) and the given name of the
359      substitution model.
360
361      Currently only scoring matrices are used.
362
363      :Parameters:
364          model : str
365              Name of the model matrix to be used to calculate distance.
366              The attribute `dna_matrices` contains the available model
367              names for DNA sequences and `protein_matrices` for protein
368              sequences.
369
370      Examples
371      --------
372      >>> from Bio.Phylo.TreeConstruction import DistanceCalculator
373      >>> from Bio import AlignIO
374      >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip')
375      >>> print aln
376      SingleLetterAlphabet() alignment with 5 rows and 13 columns
377      AACGTGGCCACAT Alpha
378      AAGGTCGCCACAC Beta
379      GAGATTTCCGCCT Delta
380      GAGATCTCCGCCC Epsilon
381      CAGTTCGCCACAA Gamma
382
383      DNA calculator with 'identity' model::
384
385          >>> calculator = DistanceCalculator('identity')
386          >>> dm = calculator.get_distance(aln)
387          >>> print dm
388          Alpha   0
389          Beta    0.230769230769  0
390          Gamma   0.384615384615  0.230769230769  0
391          Delta   0.538461538462  0.538461538462  0.538461538462  0
392          Epsilon 0.615384615385  0.384615384615  0.461538461538  0.153846153846  0
393                  Alpha           Beta            Gamma           Delta           Epsilon
394
395      Protein calculator with 'blosum62' model::
396
397          >>> calculator = DistanceCalculator('blosum62')
398          >>> dm = calculator.get_distance(aln)
399          >>> print dm
400          Alpha   0
401          Beta    0.369047619048  0
402          Gamma   0.493975903614  0.25            0
403          Delta   0.585365853659  0.547619047619  0.566265060241  0
404          Epsilon 0.7             0.355555555556  0.488888888889  0.222222222222  0
405                  Alpha           Beta            Gamma           Delta           Epsilon
406
407      """
408
409      dna_alphabet = ['A', 'T', 'C', 'G']
410
411      # BLAST nucleic acid scoring matrix
412      blastn = [[5],
413                [-4, 5],
414                [-4, -4, 5],
415                [-4, -4, -4, 5]]
416
417      # transition/transversion scoring matrix
418      trans = [[6],
419               [-5, 6],
420               [-5, -1, 6],
421               [-1, -5, -5, 6]]
422
423      protein_alphabet = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
424                          'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y',
425                          'Z']
426
427      # matrices available
428      dna_matrices = {'blastn': blastn, 'trans': trans}
429      protein_models = MatrixInfo.available_matrices
430      protein_matrices = dict((name, getattr(MatrixInfo, name))
431                              for name in protein_models)
432
433      dna_models = list(dna_matrices.keys())
434
435      models = ['identity'] + dna_models + protein_models
436
437 -    def __init__(self, model='identity', skip_letters=None):
438          """Initialize with a distance model."""
439          # Shim for backward compatibility (#491)
440          if skip_letters:
441              self.skip_letters = skip_letters
442          elif model == 'identity':
443              self.skip_letters = ()
444          else:
445              self.skip_letters = ('-', '*')
446
447          if model == 'identity':
448              self.scoring_matrix = None
449          elif model in self.dna_models:
450              self.scoring_matrix = _Matrix(self.dna_alphabet,
451                                            self.dna_matrices[model])
452          elif model in self.protein_models:
453              self.scoring_matrix = self._build_protein_matrix(
454                  self.protein_matrices[model])
455          else:
456              raise ValueError("Model not supported. Available models: " +
457                               ", ".join(self.models))
458
459 -    def _pairwise(self, seq1, seq2):
460          """Calculate pairwise distance from two sequences (PRIVATE).
461
462          Returns a value between 0 (identical sequences) and 1 (completely
463          different, or seq1 is an empty string.)
464          """
465          score = 0
466          max_score = 0
467          if self.scoring_matrix:
468              max_score1 = 0
469              max_score2 = 0
470              for i in range(0, len(seq1)):
471                  l1 = seq1[i]
472                  l2 = seq2[i]
473                  if l1 in self.skip_letters or l2 in self.skip_letters:
474                      continue
475                  if l1 not in self.scoring_matrix.names:
476                      raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'"
477                                       % (l1, seq1.id, i))
478                  if l2 not in self.scoring_matrix.names:
479                      raise ValueError("Bad alphabet '%s' in sequence '%s' at position '%s'"
480                                       % (l2, seq2.id, i))
481                  max_score1 += self.scoring_matrix[l1, l1]
482                  max_score2 += self.scoring_matrix[l2, l2]
483                  score += self.scoring_matrix[l1, l2]
484              # Take the higher score if the matrix is asymmetrical
485              max_score = max(max_score1, max_score2)
486          else:
487              # Score by character identity, not skipping any special letters
488              score = sum(l1 == l2
489                          for l1, l2 in zip(seq1, seq2)
490                          if l1 not in self.skip_letters and l2 not in self.skip_letters)
491              max_score = len(seq1)
492          if max_score == 0:
493              return 1  # max possible scaled distance
494          return 1 - (score * 1.0 / max_score)
495
496 -    def get_distance(self, msa):
497          """Return a DistanceMatrix for MSA object.
498
499          :Parameters:
500              msa : MultipleSeqAlignment
501                  DNA or Protein multiple sequence alignment.
502
503          """
504          if not isinstance(msa, MultipleSeqAlignment):
505              raise TypeError("Must provide a MultipleSeqAlignment object.")
506
507          names = [s.id for s in msa]
508          dm = DistanceMatrix(names)
509          for seq1, seq2 in itertools.combinations(msa, 2):
510              dm[seq1.id, seq2.id] = self._pairwise(seq1, seq2)
511          return dm
512
513 -    def _build_protein_matrix(self, subsmat):
514          """Convert matrix from SubsMat format to _Matrix object (PRIVATE)."""
515          protein_matrix = _Matrix(self.protein_alphabet)
516          for k, v in subsmat.items():
517              aa1, aa2 = k
518              protein_matrix[aa1, aa2] = v
519          return protein_matrix
520
521
522 -class TreeConstructor(object):
523      """Base class for all tree constructor."""
524
525 -    def build_tree(self, msa):
526          """Caller to built the tree from a MultipleSeqAlignment object.
527
528          This should be implemented in subclass.
529          """
530          raise NotImplementedError("Method not implemented!")
531
532
533 -class DistanceTreeConstructor(TreeConstructor):
534      """Distance based tree constructor.
535
536      :Parameters:
537          method : str
538              Distance tree construction method, 'nj'(default) or 'upgma'.
539          distance_calculator : DistanceCalculator
540              The distance matrix calculator for multiple sequence alignment.
541              It must be provided if `build_tree` will be called.
542
543      Examples
544      --------
545      >>> from TreeConstruction import DistanceTreeConstructor
546      >>> constructor = DistanceTreeConstructor()
547
548      UPGMA Tree:
549
550      >>> upgmatree = constructor.upgma(dm)
551      >>> print upgmatree
552      Tree(rooted=True)
562
563      NJ Tree:
564
565      >>> njtree = constructor.nj(dm)
566      >>> print njtree
567      Tree(rooted=False)
576
577
578      """
579
580      methods = ['nj', 'upgma']
581
582 -    def __init__(self, distance_calculator=None, method="nj"):
583          """Initialize the class."""
584          if (distance_calculator is None or isinstance(distance_calculator, DistanceCalculator)):
585              self.distance_calculator = distance_calculator
586          else:
587              raise TypeError("Must provide a DistanceCalculator object.")
588          if isinstance(method, str) and method in self.methods:
589              self.method = method
590          else:
591              raise TypeError("Bad method: " + method +
592                              ". Available methods: " + ", ".join(self.methods))
593
594 -    def build_tree(self, msa):
595          if self.distance_calculator:
596              dm = self.distance_calculator.get_distance(msa)
597              tree = None
598              if self.method == 'upgma':
599                  tree = self.upgma(dm)
600              else:
601                  tree = self.nj(dm)
602              return tree
603          else:
604              raise TypeError("Must provide a DistanceCalculator object.")
605
606 -    def upgma(self, distance_matrix):
607          """Construct and return an UPGMA tree.
608
609          Constructs and returns an Unweighted Pair Group Method
610          with Arithmetic mean (UPGMA) tree.
611
612          :Parameters:
613              distance_matrix : DistanceMatrix
614                  The distance matrix for tree construction.
615
616          """
617          if not isinstance(distance_matrix, DistanceMatrix):
618              raise TypeError("Must provide a DistanceMatrix object.")
619
620          # make a copy of the distance matrix to be used
621          dm = copy.deepcopy(distance_matrix)
624          # init minimum index
625          min_i = 0
626          min_j = 0
627          inner_count = 0
628          while len(dm) > 1:
629              min_dist = dm[1, 0]
630              # find minimum index
631              for i in range(1, len(dm)):
632                  for j in range(0, i):
633                      if min_dist >= dm[i, j]:
634                          min_dist = dm[i, j]
635                          min_i = i
636                          min_j = j
637
641              inner_count += 1
645              # assign branch length
647                  clade1.branch_length = min_dist * 1.0 / 2
648              else:
649                  clade1.branch_length = min_dist * \
650                      1.0 / 2 - self._height_of(clade1)
651
653                  clade2.branch_length = min_dist * 1.0 / 2
654              else:
655                  clade2.branch_length = min_dist * \
656                      1.0 / 2 - self._height_of(clade2)
657
658              # update node list
661
662              # rebuild distance matrix,
663              # set the distances of new node at the index of min_j
664              for k in range(0, len(dm)):
665                  if k != min_i and k != min_j:
666                      dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2
667
668              dm.names[min_j] = "Inner" + str(inner_count)
669
670              del dm[min_i]
673
674 -    def nj(self, distance_matrix):
675          """Construct and return a Neighbor Joining tree.
676
677          :Parameters:
678              distance_matrix : DistanceMatrix
679                  The distance matrix for tree construction.
680
681          """
682          if not isinstance(distance_matrix, DistanceMatrix):
683              raise TypeError("Must provide a DistanceMatrix object.")
684
685          # make a copy of the distance matrix to be used
686          dm = copy.deepcopy(distance_matrix)
689          # init node distance
690          node_dist = [0] * len(dm)
691          # init minimum index
692          min_i = 0
693          min_j = 0
694          inner_count = 0
695          while len(dm) > 2:
696              # calculate nodeDist
697              for i in range(0, len(dm)):
698                  node_dist[i] = 0
699                  for j in range(0, len(dm)):
700                      node_dist[i] += dm[i, j]
701                  node_dist[i] = node_dist[i] / (len(dm) - 2)
702
703              # find minimum distance pair
704              min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
705              min_i = 0
706              min_j = 1
707              for i in range(1, len(dm)):
708                  for j in range(0, i):
709                      temp = dm[i, j] - node_dist[i] - node_dist[j]
710                      if min_dist > temp:
711                          min_dist = temp
712                          min_i = i
713                          min_j = j
717              inner_count += 1
721              # assign branch length
722              clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] -
723                                      node_dist[min_j]) / 2.0
725
726              # update node list
729
730              # rebuild distance matrix,
731              # set the distances of new node at the index of min_j
732              for k in range(0, len(dm)):
733                  if k != min_i and k != min_j:
734                      dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] -
735                                      dm[min_i, min_j]) / 2.0
736
737              dm.names[min_j] = "Inner" + str(inner_count)
738              del dm[min_i]
739
740          # set the last clade as one of the child of the inner_clade
741          root = None
747          else:
752
753          return BaseTree.Tree(root, rooted=False)
754
756          """Calculate clade height -- the longest path to any terminal (PRIVATE)."""
757          height = 0
760          else:
762          return height
763
764  # #################### Tree Scoring and Searching Classes #####################
765
766
767 -class Scorer(object):
768      """Base class for all tree scoring methods."""
769
770 -    def get_score(self, tree, alignment):
771          """Caller to get the score of a tree for the given alignment.
772
773          This should be implemented in subclass.
774          """
775          raise NotImplementedError("Method not implemented!")
776
777
778 -class TreeSearcher(object):
779      """Base class for all tree searching methods."""
780
781 -    def search(self, starting_tree, alignment):
782          """Caller to search the best tree with a starting tree.
783
784          This should be implemented in subclass.
785          """
786          raise NotImplementedError("Method not implemented!")
787
788
789 -class NNITreeSearcher(TreeSearcher):
790      """Tree searching with Nearest Neighbor Interchanges (NNI) algorithm.
791
792      :Parameters:
793          scorer : ParsimonyScorer
794              parsimony scorer to calculate the parsimony score of
795              different trees during NNI algorithm.
796
797      """
798
799 -    def __init__(self, scorer):
800          """Initialize the class."""
801          if isinstance(scorer, Scorer):
802              self.scorer = scorer
803          else:
804              raise TypeError("Must provide a Scorer object.")
805
806 -    def search(self, starting_tree, alignment):
807          """Implement the TreeSearcher.search method.
808
809          :Parameters:
810             starting_tree : Tree
811                 starting tree of NNI method.
812             alignment : MultipleSeqAlignment
813                 multiple sequence alignment used to calculate parsimony
814                 score of different NNI trees.
815
816          """
817          return self._nni(starting_tree, alignment)
818
819 -    def _nni(self, starting_tree, alignment):
820          """Search for the best parsimony tree using the NNI algorithm (PRIVATE)."""
821          best_tree = starting_tree
822          while True:
823              best_score = self.scorer.get_score(best_tree, alignment)
824              temp = best_score
825              for t in self._get_neighbors(best_tree):
826                  score = self.scorer.get_score(t, alignment)
827                  if score < best_score:
828                      best_score = score
829                      best_tree = t
830              # stop if no smaller score exist
831              if best_score >= temp:
832                  break
833          return best_tree
834
835 -    def _get_neighbors(self, tree):
836          """Get all neighbor trees of the given tree (PRIVATE).
837
838          Currently only for binary rooted trees.
839          """
840          # make child to parent dict
841          parents = {}
845                  # cannot get the parent if the parent is root. Bug?
846                  if len(node_path) == 1:
848                  else:
850          neighbors = []
851          root_childs = []
856                  root_childs.append(left)
857                  root_childs.append(right)
858                  if not left.is_terminal() and not right.is_terminal():
859                      # make changes around the left_left clade
864                      # neightbor 1 (left_left + right_right)
869                      temp_tree = copy.deepcopy(tree)
870                      neighbors.append(temp_tree)
871                      # neighbor 2 (left_left + right_left)
876                      temp_tree = copy.deepcopy(tree)
877                      neighbors.append(temp_tree)
878                      # change back (left_left + left_right)
884                  # skip root child
885                  continue
886              else:
887                  # method for other clades
888                  # make changes around the parent clade
894                      # neighbor 1 (parent + right)
899                      temp_tree = copy.deepcopy(tree)
900                      neighbors.append(temp_tree)
901                      # neighbor 2 (parent + left)
906                      temp_tree = copy.deepcopy(tree)
907                      neighbors.append(temp_tree)
908                      # change back (parent + sister)
913                  else:
915                      # neighbor 1 (parent + right)
920                      temp_tree = copy.deepcopy(tree)
921                      neighbors.append(temp_tree)
922                      # neighbor 2 (parent + left)
927                      temp_tree = copy.deepcopy(tree)
928                      neighbors.append(temp_tree)
929                      # change back (parent + sister)
934          return neighbors
935
936  # ######################## Parsimony Classes ##########################
937
938
939 -class ParsimonyScorer(Scorer):
940      """Parsimony scorer with a scoring matrix.
941
942      This is a combination of Fitch algorithm and Sankoff algorithm.
943      See ParsimonyTreeConstructor for usage.
944
945      :Parameters:
946          matrix : _Matrix
947              scoring matrix used in parsimony score calculation.
948
949      """
950
951 -    def __init__(self, matrix=None):
952          """Initialize the class."""
953          if not matrix or isinstance(matrix, _Matrix):
954              self.matrix = matrix
955          else:
956              raise TypeError("Must provide a _Matrix object.")
957
958 -    def get_score(self, tree, alignment):
959          """Calculate parsimony score using the Fitch algorithm.
960
961          Calculate and return the parsimony score given a tree and the
962          MSA using either the Fitch algorithm (without a penalty matrix)
963          or the Sankoff algorithm (with a matrix).
964          """
965          # make sure the tree is rooted and bifurcating
966          if not tree.is_bifurcating():
967              raise ValueError("The tree provided should be bifurcating.")
968          if not tree.rooted:
969              tree.root_at_midpoint()
970          # sort tree terminals and alignment
971          terms = tree.get_terminals()
972          terms.sort(key=lambda term: term.name)
973          alignment.sort()
974          if not all(t.name == a.id for t, a in zip(terms, alignment)):
975              raise ValueError(
976                  "Taxon names of the input tree should be the same with the alignment.")
977          # term_align = dict(zip(terms, alignment))
978          score = 0
979          for i in range(len(alignment[0])):
980              # parsimony score for column_i
981              score_i = 0
982              # get column
983              column_i = alignment[:, i]
984              # skip non-informative column
985              if column_i == len(column_i) * column_i[0]:
986                  continue
987
988              # start calculating score_i using the tree and column_i
989
990              # Fitch algorithm without the penalty matrix
991              if not self.matrix:
992                  # init by mapping terminal clades and states in column_i
993                  clade_states = dict(zip(terms, [set([c]) for c in column_i]))
998                      state = left_state & right_state
999                      if not state:
1000                          state = left_state | right_state
1001                          score_i = score_i + 1
1003              # Sankoff algorithm with the penalty matrix
1004              else:
1005                  inf = float('inf')
1006                  # init score arrays for terminal clades
1007                  alphabet = self.matrix.names
1008                  length = len(alphabet)
1010                  for j in range(len(column_i)):
1011                      array = [inf] * length
1012                      index = alphabet.index(column_i[j])
1013                      array[index] = 0
1015                  # bottom up calculation
1020                      array = []
1021                      for m in range(length):
1022                          min_l = inf
1023                          min_r = inf
1024                          for n in range(length):
1025                              sl = self.matrix[
1026                                  alphabet[m], alphabet[n]] + left_score[n]
1027                              sr = self.matrix[
1028                                  alphabet[m], alphabet[n]] + right_score[n]
1029                              if min_l > sl:
1030                                  min_l = sl
1031                              if min_r > sr:
1032                                  min_r = sr
1033                          array.append(min_l + min_r)
1035                  # minimum from root score
1036                  score_i = min(array)
1037                  # TODO: resolve internal states
1038              score = score + score_i
1039          return score
1040
1041
1042 -class ParsimonyTreeConstructor(TreeConstructor):
1043      """Parsimony tree constructor.
1044
1045      :Parameters:
1046          searcher : TreeSearcher
1047              tree searcher to search the best parsimony tree.
1048          starting_tree : Tree
1049              starting tree provided to the searcher.
1050
1051      Examples
1052      --------
1053      >>> from Bio import AlignIO
1054      >>> from TreeConstruction import *
1055      >>> aln = AlignIO.read(open('Tests/TreeConstruction/msa.phy'), 'phylip')
1056      >>> print aln
1057      SingleLetterAlphabet() alignment with 5 rows and 13 columns
1058      AACGTGGCCACAT Alpha
1059      AAGGTCGCCACAC Beta
1060      GAGATTTCCGCCT Delta
1061      GAGATCTCCGCCC Epsilon
1062      CAGTTCGCCACAA Gamma
1063      >>> starting_tree = Phylo.read('Tests/TreeConstruction/nj.tre', 'newick')
1064      >>> print tree
1065      Tree(weight=1.0, rooted=False)
1074      >>> from TreeConstruction import *
1075      >>> scorer = ParsimonyScorer()
1076      >>> searcher = NNITreeSearcher(scorer)
1077      >>> constructor = ParsimonyTreeConstructor(searcher, starting_tree)
1078      >>> pars_tree = constructor.build_tree(aln)
1079      >>> print pars_tree
1080      Tree(weight=1.0, rooted=True)
1090
1091      """
1092
1093 -    def __init__(self, searcher, starting_tree=None):
1094          """Initialize the class."""
1095          self.searcher = searcher
1096          self.starting_tree = starting_tree
1097
1098 -    def build_tree(self, alignment):
1099          """Build the tree.
1100
1101          :Parameters:
1102              alignment : MultipleSeqAlignment
1103                  multiple sequence alignment to calculate parsimony tree.
1104
1105          """
1106          # if starting_tree is none,
1107          # create a upgma tree with 'identity' scoring matrix
1108          if self.starting_tree is None:
1109              dtc = DistanceTreeConstructor(DistanceCalculator("identity"),
1110                                            "upgma")
1111              self.starting_tree = dtc.build_tree(alignment)
1112          return self.searcher.search(self.starting_tree, alignment)
1113
<!--
expandto(location.href);
// -->

```

 Generated by Epydoc 3.0.1 on Fri Jun 22 16:36:12 2018 http://epydoc.sourceforge.net