Package Bio :: Package CodonAlign :: Module CodonSeq'
[hide private]
[frames] | no frames]

Source Code for Module Bio.CodonAlign.CodonSeq'

   1  # Copyright 2013 by Zheng Ruan (zruan1991@gmai.com). 
   2  # All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Code for dealing with Codon Seq. 
   7   
   8  CodonSeq class is interited from Seq class. This is the core class to 
   9  deal with sequences in CodonAlignment in biopython. 
  10   
  11  """ 
  12  from __future__ import division, print_function 
  13  from itertools import permutations 
  14  from math import log 
  15   
  16  from Bio.Seq import Seq 
  17  from Bio.SeqRecord import SeqRecord 
  18  from Bio.Alphabet import IUPAC, Gapped, HasStopCodon, Alphabet 
  19  from Bio.Alphabet import generic_dna, _ungap 
  20  from Bio.Data.CodonTable import generic_by_id 
  21   
  22  from Bio.CodonAlign.CodonAlphabet import default_codon_alphabet, default_codon_table 
  23   
  24  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
25 26 -class CodonSeq(Seq):
27 """CodonSeq is designed to be within the SeqRecords of a 28 CodonAlignment class. 29 30 CodonSeq is useful as it allows the user to specify 31 reading frame when translate CodonSeq 32 33 CodonSeq also accepts codon style slice by calling 34 get_codon() method. 35 36 Important: Ungapped CodonSeq can be any length if you 37 specify the rf_table. Gapped CodonSeq should be a 38 multiple of three. 39 40 >>> codonseq = CodonSeq("AAATTTGGGCCAAATTT", rf_table=(0,3,6,8,11,14)) 41 >>> print(codonseq.translate()) 42 KFGAKF 43 44 test get_full_rf_table method 45 46 >>> p = CodonSeq('AAATTTCCCGG-TGGGTTTAA', rf_table=(0, 3, 6, 9, 11, 14, 17)) 47 >>> full_rf_table = p.get_full_rf_table() 48 >>> print(full_rf_table) 49 [0, 3, 6, 9, 12, 15, 18] 50 >>> print(p.translate(rf_table=full_rf_table, ungap_seq=False)) 51 KFPPWV* 52 >>> p = CodonSeq('AAATTTCCCGGGAA-TTTTAA', rf_table=(0, 3, 6, 9, 14, 17)) 53 >>> print(p.get_full_rf_table()) 54 [0, 3, 6, 9, 12.0, 15, 18] 55 >>> p = CodonSeq('AAA------------TAA', rf_table=(0, 3)) 56 >>> print(p.get_full_rf_table()) 57 [0, 3.0, 6.0, 9.0, 12.0, 15] 58 59 """
60 - def __init__(self, data='', alphabet=default_codon_alphabet, \ 61 gap_char="-", rf_table=None):
62 # rf_table should be a tuple or list indicating the every 63 # codon position along the sequence. For example: 64 # sequence = 'AAATTTGGGCCAAATTT' 65 # rf_table = (0, 3, 6, 8, 11, 14) 66 # the translated protein sequences will be 67 # AAA TTT GGG GCC AAA TTT 68 # K F G A K F 69 # Notice: rf_table applies to ungapped sequence. If there 70 # are gaps in the sequence, they will be discarded. This 71 # feature ensures the rf_table is independent of where the 72 # codon sequence appears in the alignment 73 74 Seq.__init__(self, data.upper(), alphabet=alphabet) 75 self.gap_char = gap_char 76 77 # check the length of the alignment to be a triple 78 if rf_table is None: 79 seq_ungapped = self._data.replace(gap_char, "") 80 assert len(self) % 3 == 0, "Sequence length is not a triple number" 81 self.rf_table = list(filter(lambda x: x%3 == 0, 82 range(len(seq_ungapped)))) 83 # check alphabet 84 # Not use Alphabet._verify_alphabet function because it 85 # only works for single alphabet 86 for i in self.rf_table: 87 if self._data[i:i+3] not in alphabet.letters: 88 raise ValueError("Sequence contain undefined letters from" 89 " alphabet " 90 "({0})! ".format(self._data[i:i+3])) 91 else: 92 #if gap_char in self._data: 93 # assert len(self) % 3 == 0, \ 94 # "Gapped sequence length is not a triple number" 95 assert isinstance(rf_table, (tuple, list)), \ 96 "rf_table should be a tuple or list object" 97 assert all(isinstance(i, int) for i in rf_table), \ 98 "elements in rf_table should be int that specify " \ 99 + "the codon positions of the sequence" 100 seq_ungapped = self._data.replace(gap_char, "") 101 for i in rf_table: 102 if seq_ungapped[i:i+3] not in alphabet.letters: 103 raise ValueError("Sequence contain undefined letters " 104 "from alphabet " 105 "({0})!".format(seq_ungapped[i:i+3])) 106 self.rf_table = rf_table
107
108 - def __getitem__(self, index):
109 # TODO: handle alphabet elegantly 110 return Seq(self._data[index], alphabet=generic_dna)
111
112 - def get_codon(self, index):
113 """get the `index`-th codon from the self.seq 114 """ 115 if len(set([i % 3 for i in self.rf_table])) != 1: 116 raise RuntimeError("frameshift detected. " 117 "CodonSeq object is not able to deal " 118 "with codon sequence with frameshift. " 119 "Plase use normal slice option.") 120 if isinstance(index, int): 121 if index != -1: 122 return self._data[index*3:(index+1)*3] 123 else: 124 return self._data[index*3:] 125 else: 126 # This slice ensures that codon will always be the unit 127 # in slicing (it won't change to other codon if you are 128 # using reverse slicing such as [::-1]). 129 # The idea of the code below is to first map the slice 130 # to amino acid sequence and then transform it into 131 # codon sequence. 132 aa_index = range(len(self)//3) 133 def cslice(p): 134 aa_slice = aa_index[p] 135 codon_slice = '' 136 for i in aa_slice: 137 codon_slice += self._data[i*3:i*3+3] 138 return codon_slice
139 codon_slice = cslice(index) 140 return CodonSeq(codon_slice, alphabet=self.alphabet)
141
142 - def get_codon_num(self):
143 """Return the number of codons in the CodonSeq""" 144 return len(self.rf_table)
145
146 - def translate(self, codon_table=default_codon_table, 147 stop_symbol="*", rf_table=None, ungap_seq=True):
148 """Translate the CodonSeq based on the reading frame 149 in rf_table. It is possible for the user to specify 150 a rf_table at this point. If you want to include 151 gaps in the translated sequence, this is the only 152 way. ungap_seq should be set to true for this 153 purpose. 154 """ 155 amino_acids = [] 156 if ungap_seq is True: 157 tr_seq = self._data.replace(self.gap_char, "") 158 else: 159 tr_seq = self._data 160 if rf_table is None: 161 rf_table = self.rf_table 162 p = -1 #initiation 163 for i in rf_table: 164 if isinstance(i, float): 165 amino_acids.append('-') 166 continue 167 #elif '---' == tr_seq[i:i+3]: 168 # amino_acids.append('-') 169 # continue 170 elif '-' in tr_seq[i:i+3]: 171 # considering two types of frameshift 172 if p == -1 or p - i == 3: 173 p = i 174 codon = tr_seq[i:i+6].replace('-', '')[:3] 175 elif p - i > 3: 176 codon = tr_seq[i:i+3] 177 p = i 178 else: 179 # normal condition without gaps 180 codon = tr_seq[i:i+3] 181 p = i 182 if codon in codon_table.stop_codons: 183 amino_acids.append(stop_symbol) 184 continue 185 try: 186 amino_acids.append(codon_table.forward_table[codon]) 187 except KeyError: 188 raise RuntimeError("Unknown codon detected ({0}). Do you " 189 "forget to speficy ungap_seq " 190 "argument?".format(codon)) 191 return "".join(amino_acids)
192
193 - def toSeq(self, alphabet=generic_dna):
194 return Seq(self._data, generic_dna)
195
196 - def get_full_rf_table(self):
197 """This function returns a full rf_table of the given 198 CodonSeq records. A full rf_table is different from 199 normal rf_table in that it translate gaps in CodonSeq. 200 It is helpful to construct alignment containing 201 frameshift. 202 """ 203 ungap_seq = self._data.replace("-", "") 204 codon_lst = [ungap_seq[i:i+3] for i in self.rf_table] 205 relative_pos = [self.rf_table[0]] 206 for i in range(1, len(self.rf_table[1:])+1): 207 relative_pos.append(self.rf_table[i]-self.rf_table[i-1]) 208 full_rf_table = [] 209 codon_num = 0 210 for i in filter(lambda x: x%3==0, range(len(self._data))): 211 if self._data[i:i+3] == self.gap_char*3: 212 full_rf_table.append(i+0.0) 213 elif relative_pos[codon_num] == 0: 214 full_rf_table.append(i) 215 codon_num += 1 216 elif relative_pos[codon_num] in (-1, -2): 217 # check the gap status of previous codon 218 gap_stat = len(self._data[i-3:i].replace("-", "")) 219 if gap_stat == 3: 220 full_rf_table.append(i+relative_pos[codon_num]) 221 elif gap_stat == 2: 222 full_rf_table.append(i+1+relative_pos[codon_num]) 223 elif gap_stat == 1: 224 full_rf_table.append(i+2+relative_pos[codon_num]) 225 codon_num += 1 226 elif relative_pos[codon_num] > 0: 227 full_rf_table.append(i+0.0) 228 try: 229 this_len = len(self._data[i:i+3].replace("-", "")) 230 relative_pos[codon_num] -= this_len 231 except: 232 # we probably reached the last codon 233 pass 234 return full_rf_table
235
236 - def full_translate(self, codon_table=default_codon_table, stop_symbol="*"):
237 """Apply full translation with gaps considered. 238 """ 239 full_rf_table = self.get_full_rf_table() 240 return self.translate(codon_table=codon_table, stop_symbol=stop_symbol, 241 rf_table=full_rf_table, ungap_seq=False)
242
243 - def ungap(self, gap=None):
244 if hasattr(self.alphabet, "gap_char"): 245 if not gap: 246 gap = self.alphabet.gap_char 247 elif gap != self.alphabet.gap_char: 248 raise ValueError("Gap %s does not match %s from alphabet" 249 % (repr(gap), repr(self.alphabet.alphabet.gap_char))) 250 alpha = _ungap(self.alphabet) 251 elif not gap: 252 raise ValueError("Gap character not given and not defined in " 253 "alphabet") 254 else: 255 alpha = self.alphabet # modify! 256 if len(gap) != 1 or not isinstance(gap, str): 257 raise ValueError("Unexpected gap character, %s" % repr(gap)) 258 return CodonSeq(str(self._data).replace(gap, ""), alpha, 259 rf_table=self.rf_table)
260 261 @classmethod
262 - def from_seq(cls, seq, alphabet=default_codon_alphabet, rf_table=None):
263 if rf_table is None: 264 return cls(seq._data, alphabet=alphabet) 265 else: 266 return cls(seq._data, alphabet=alphabet, rf_table=rf_table)
267
268 269 -def _get_codon_list(codonseq):
270 """get a list of codons according to full_rf_table for counting 271 (PRIVATE). 272 """ 273 #if not isinstance(codonseq, CodonSeq): 274 # raise TypeError("_get_codon_list accept a CodonSeq object " 275 # "({0} detected)".format(type(codonseq))) 276 full_rf_table = codonseq.get_full_rf_table() 277 codon_lst = [] 278 for i, k in enumerate(full_rf_table): 279 if isinstance(k, int): 280 start = k 281 try: 282 end = int(full_rf_table[i+1]) 283 except IndexError: 284 end = start+3 285 this_codon = str(codonseq[start:end]) 286 if len(this_codon) == 3: 287 codon_lst.append(this_codon) 288 else: 289 codon_lst.append(str(this_codon.ungap())) 290 elif str(codonseq[int(k):int(k)+3]) == "---": 291 codon_lst.append("---") 292 else: 293 # this may be problematic, as normally no codon shoud 294 # fall into this condition 295 codon_lst.append(codonseq[int(k):int(k)+3]) 296 return codon_lst
297
298 299 -def cal_dn_ds(codon_seq1, codon_seq2, method="NG86", 300 codon_table=default_codon_table, k=1, cfreq=None):
301 """Function to calculate the dN and dS of the given two CodonSeq 302 or SeqRecord that contain CodonSeq objects. 303 304 Available methods: 305 - NG86 - PMID: 3444411 306 - LWL85 - PMID: 3916709 307 - ML - PMID: 7968486 308 - YN00 - PMID: 10666704 309 310 Arguments: 311 - w - transition/transvertion ratio 312 - cfreq - Current codon frequency vector can only be specified 313 when you are using ML method. Possible ways of 314 getting cfreq are: F1x4, F3x4 and F61. 315 """ 316 if all([isinstance(codon_seq1, CodonSeq), 317 isinstance(codon_seq2, CodonSeq)]): 318 pass 319 elif all([isinstance(codon_seq1, SeqRecord), 320 isinstance(codon_seq2, SeqRecord)]): 321 codon_seq1 = codon_seq1.seq 322 codon_seq2 = codon_seq2.seq 323 else: 324 raise TypeError("cal_dn_ds accepts two CodonSeq objects or SeqRecord " 325 "that contains CodonSeq as its seq!") 326 if len(codon_seq1.get_full_rf_table()) != \ 327 len(codon_seq2.get_full_rf_table()): 328 raise RuntimeError("full_rf_table length of seq1 ({0}) and seq2 ({1}) " 329 "are not the same".format( 330 len(codon_seq1.get_full_rf_table()), 331 len(codon_seq2.get_full_rf_table())) 332 ) 333 if cfreq is None: 334 cfreq = 'F3x4' 335 elif cfreq is not None and method != 'ML': 336 raise RuntimeError("cfreq can only be specified when you " 337 "are using ML method") 338 if cfreq not in ('F1x4', 'F3x4', 'F61'): 339 import warnings 340 warnings.warn("Unknown cfreq ({0}). Only F1x4, F3x4 and F61 are " 341 "acceptable. Use F3x4 in the following.".format(cfreq)) 342 cfreq = 'F3x4' 343 seq1_codon_lst = _get_codon_list(codon_seq1) 344 seq2_codon_lst = _get_codon_list(codon_seq2) 345 # remove gaps in seq_codon_lst 346 seq1 = [] 347 seq2 = [] 348 for i, j in zip(seq1_codon_lst, seq2_codon_lst): 349 if (not '-' in i) and (not '-' in j): 350 seq1.append(i) 351 seq2.append(j) 352 dnds_func = {'ML': _ml, 'NG86': _ng86, 'LWL85': _lwl85, 'YN00': _yn00} 353 if method == "ML": 354 return dnds_func[method](seq1, seq2, cfreq, codon_table) 355 else: 356 return dnds_func[method](seq1, seq2, k, codon_table)
357
358 359 ################################################################# 360 # private functions for NG86 method 361 ################################################################# 362 363 -def _ng86(seq1, seq2, k, codon_table):
364 """Main function for NG86 method (PRIVATE). 365 """ 366 S_sites1, N_sites1 = _count_site_NG86(seq1, 367 codon_table=codon_table, k=k) 368 S_sites2, N_sites2 = _count_site_NG86(seq2, 369 codon_table=codon_table, k=k) 370 S_sites = (S_sites1 + S_sites2) / 2.0 371 N_sites = (N_sites1 + N_sites2) / 2.0 372 SN = [0, 0] 373 for i, j in zip(seq1, seq2): 374 SN = [m+n for m,n in zip(SN, _count_diff_NG86( 375 i, j, 376 codon_table=codon_table) 377 ) 378 ] 379 ps = SN[0] / S_sites 380 pn = SN[1] / N_sites 381 if ps < 3/4: 382 dS = abs(-3.0/4*log(1-4.0/3*ps)) 383 else: 384 dS = -1 385 if pn < 3/4: 386 dN = abs(-3.0/4*log(1-4.0/3*pn)) 387 else: 388 dN = -1 389 return dN, dS
390
391 392 -def _count_site_NG86(codon_lst, k=1, codon_table=default_codon_table):
393 """count synonymous and non-synonymous sites of a list of codons 394 (PRIVATE). 395 Argument: 396 - codon_lst - A three letter codon list from a CodonSeq object. 397 This can be returned from _get_codon_list method. 398 - k - transition/transversion rate ratio 399 """ 400 S_site = 0 # synonymous sites 401 N_site = 0 # non-synonymous sites 402 purine = ('A', 'G') 403 pyrimidine = ('T', 'C') 404 base_tuple = ('A', 'T', 'C', 'G') 405 for codon in codon_lst: 406 neighbor_codon = {'transition': [], 'transversion': []} 407 # classify neighbor codons 408 codon = codon.replace('U', 'T') 409 if codon == '---': continue 410 for n, i in enumerate(codon): 411 for j in base_tuple: 412 if i == j: 413 pass 414 elif i in purine and j in purine: 415 codon_chars = [c for c in codon] 416 codon_chars[n] = j 417 this_codon = ''.join(codon_chars) 418 neighbor_codon['transition'].append(this_codon) 419 elif i in pyrimidine and j in pyrimidine: 420 codon_chars = [c for c in codon] 421 codon_chars[n] = j 422 this_codon = ''.join(codon_chars) 423 neighbor_codon['transition'].append(this_codon) 424 else: 425 codon_chars = [c for c in codon] 426 codon_chars[n] = j 427 this_codon = ''.join(codon_chars) 428 neighbor_codon['transversion'].append(this_codon) 429 # count synonymous and non-synonymous sites 430 aa = codon_table.forward_table[codon] 431 this_codon_N_site = this_codon_S_site = 0 432 for neighbor in neighbor_codon['transition']: 433 if neighbor in codon_table.stop_codons: 434 this_codon_N_site += 1 435 elif codon_table.forward_table[neighbor] == aa: 436 this_codon_S_site += 1 437 else: 438 this_codon_N_site += 1 439 for neighbor in neighbor_codon['transversion']: 440 if neighbor in codon_table.stop_codons: 441 this_codon_N_site += k 442 elif codon_table.forward_table[neighbor] == aa: 443 this_codon_S_site += k 444 else: 445 this_codon_N_site += k 446 norm_const = (this_codon_N_site + this_codon_S_site)/3 447 S_site += this_codon_S_site / norm_const 448 N_site += this_codon_N_site / norm_const 449 return (S_site, N_site)
450
451 452 -def _count_diff_NG86(codon1, codon2, codon_table=default_codon_table):
453 """Count differences between two codons (three-letter string). 454 The function will take multiple pathways from codon1 to codon2 455 into account (PRIVATE). 456 """ 457 if not all([isinstance(codon1, str), isinstance(codon2, str)]): 458 raise TypeError("_count_diff_NG86 accept string object to represent " 459 "codon ({0}, {1} detected)".format( 460 type(codon1), 461 type(codon2)) 462 ) 463 if len(codon1) != 3 or len(codon2) != 3: 464 raise RuntimeError("codon should be three letter string ({0}, {1} " 465 "detected)".format(len(codon1), len(codon2))) 466 SN = [0, 0] # synonymous and nonsynonymous counts 467 if codon1 == '---' or codon2 == '---': 468 return SN 469 base_tuple = ('A', 'C', 'G', 'T') 470 if not all([i in base_tuple for i in codon1]): 471 raise RuntimeError("Unrecognized character detected in codon1 {0} " 472 "(Codon is consist of " 473 "A, T, C or G)".format(codon1)) 474 if not all([i in base_tuple for i in codon2]): 475 raise RuntimeError("Unrecognized character detected in codon2 {0} " 476 "(Codon is consist of " 477 "A, T, C or G)".format(codon2)) 478 if codon1 == codon2: 479 return SN 480 else: 481 diff_pos = [] 482 for i, k in enumerate(zip(codon1, codon2)): 483 if k[0] != k[1]: 484 diff_pos.append(i) 485 def compare_codon(codon1, codon2, codon_table=default_codon_table, 486 weight=1): 487 """Method to compare two codon accounting for different 488 pathways 489 """ 490 sd = nd = 0 491 if len(set(map(codon_table.forward_table.get, 492 [codon1, codon2]))) == 1: 493 sd += weight 494 else: 495 nd += weight 496 return (sd, nd)
497 if len(diff_pos) == 1: 498 SN = [i+j for i,j in zip(SN, 499 compare_codon(codon1, codon2, codon_table=codon_table))] 500 elif len(diff_pos) == 2: 501 codon2_aa = codon_table.forward_table[codon2] 502 for i in diff_pos: 503 temp_codon = codon1[:i] + codon2[i] + codon1[i+1:] 504 SN = [i+j for i,j in zip(SN, compare_codon( 505 codon1, temp_codon, 506 codon_table=codon_table, 507 weight=0.5)) 508 ] 509 SN = [i+j for i,j in zip(SN, compare_codon( 510 temp_codon, codon2, 511 codon_table=codon_table, 512 weight=0.5)) 513 ] 514 elif len(diff_pos) == 3: 515 codon2_aa = codon_table.forward_table[codon2] 516 paths = list(permutations([0, 1, 2], 3)) 517 tmp_codon = [] 518 for p in paths: 519 tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:] 520 tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:] 521 tmp_codon.append((tmp1, tmp2)) 522 SN = [i+j for i,j in zip(SN, compare_codon(codon1, tmp1, 523 codon_table, 524 weight=0.5/3)) 525 ] 526 SN = [i+j for i,j in zip(SN, compare_codon(tmp1, tmp2, 527 codon_table, 528 weight=0.5/3)) 529 ] 530 SN = [i+j for i,j in zip(SN, compare_codon(tmp2, codon2, 531 codon_table, 532 weight=0.5/3)) 533 ] 534 return SN 535
536 537 ################################################################# 538 # private functions for LWL85 method 539 ################################################################# 540 541 -def _lwl85(seq1, seq2, k, codon_table):
542 """Main function fo LWL85 method (PRIVATE). 543 """ 544 # Nomenclature is according to PMID (3916709) 545 codon_fold_dict = _get_codon_fold(codon_table) 546 # count number of sites in different degenerate classes 547 fold0 = [0, 0] 548 fold2 = [0, 0] 549 fold4 = [0, 0] 550 for codon in seq1 + seq2: 551 fold_num = codon_fold_dict[codon] 552 for f in fold_num: 553 if f == '0': 554 fold0[0] += 1 555 elif f == '2': 556 fold2[0] += 1 557 elif f == '4': 558 fold4[0] += 1 559 L = [sum(fold0)/2.0, sum(fold2)/2.0, sum(fold4)/2.0] 560 # count number of differences in different degenerate classes 561 PQ = [0] * 6 # with P0, P2, P4, Q0, Q2, Q4 in each position 562 for codon1, codon2 in zip(seq1, seq2): 563 if (codon1 == "---" or codon2 == "---") or codon1 == codon2: 564 continue 565 else: 566 PQ = [i+j for i, j in zip(PQ, _diff_codon( 567 codon1, 568 codon2, 569 fold_dict=codon_fold_dict) 570 )] 571 PQ = [i/j for i, j in zip(PQ, L*2)] 572 P = PQ[:3] 573 Q = PQ[3:] 574 A = [(1./2)*log(1./(1-2*i-j)) - (1./4)*log(1./(1-2*j)) \ 575 for i, j in zip(P, Q)] 576 B = [(1./2)*log(1./(1-2*i)) for i in Q] 577 dS = 3*(L[2]*A[1]+L[2]*(A[2]+B[2]))/(L[1]+3*L[2]) 578 dN = 3*(L[2]*B[1]+L[0]*(A[0]+B[0]))/(2*L[1]+3*L[0]) 579 return dN, dS
580
581 582 -def _get_codon_fold(codon_table):
583 """function to classify different position in a codon into 584 different fold (PRIVATE). 585 """ 586 def find_fold_class(codon, forward_table): 587 base = set(['A', 'T', 'C', 'G']) 588 fold = '' 589 codon_base_lst = [i for i in codon] 590 for i, b in enumerate(codon_base_lst): 591 other_base = base - set(b) 592 aa = [] 593 for j in other_base: 594 codon_base_lst[i] = j 595 try: 596 aa.append(forward_table[''.join(codon_base_lst)]) 597 except KeyError: 598 aa.append('stop') 599 if aa.count(forward_table[codon]) == 0: 600 fold += '0' 601 elif aa.count(forward_table[codon]) in (1,2): 602 fold += '2' 603 elif aa.count(forward_table[codon]) == 3: 604 fold += '4' 605 else: 606 raise RuntimeError("Unknown Error, cannot assign the " 607 "position to a fold") 608 codon_base_lst[i] = b 609 return fold
610 fold_table = {} 611 for codon in codon_table.forward_table: 612 if 'U' not in codon: 613 fold_table[codon] = find_fold_class(codon, 614 codon_table.forward_table) 615 fold_table["---"] = '---' 616 return fold_table 617
618 619 -def _diff_codon(codon1, codon2, fold_dict):
620 """function to get the differences of two codon and return 621 number of different types substitutions 622 623 return (P0, P2, P4, Q0, Q2, Q4) 624 Nomenclature is according to PMID (3916709) 625 """ 626 P0 = P2 = P4 = Q0 = Q2 = Q4 = 0 627 fold_num = fold_dict[codon1] 628 purine = ('A', 'G') 629 pyrimidine = ('T', 'C') 630 for n, (i, j) in enumerate(zip(codon1, codon2)): 631 if i!= j and (i in purine and j in purine): 632 if fold_num[n] == '0': 633 P0 += 1 634 elif fold_num[n] == '2': 635 P2 += 1 636 elif fold_num[n] == '4': 637 P4 += 1 638 else: 639 raise RuntimeError("Unexpected fold_num %d" % fold_num[n]) 640 if i!= j and (i in pyrimidine and j in pyrimidine): 641 if fold_num[n] == '0': 642 P0 += 1 643 elif fold_num[n] == '2': 644 P2 += 1 645 elif fold_num[n] == '4': 646 P4 += 1 647 else: 648 raise RuntimeError("Unexpected fold_num %d" % fold_num[n]) 649 if i != j and ((i in purine and j in pyrimidine) \ 650 or (i in pyrimidine and j in purine)): 651 if fold_num[n] == '0': 652 Q0 += 1 653 elif fold_num[n] == '2': 654 Q2 += 1 655 elif fold_num[n] == '4': 656 Q4 += 1 657 else: 658 raise RuntimeError("Unexpected fold_num %d" % fold_num[n]) 659 return (P0, P2, P4, Q0, Q2, Q4)
660
661 662 ################################################################# 663 # private functions for YN00 method 664 ################################################################# 665 666 -def _yn00(seq1, seq2, k, codon_table):
667 """Main function for yn00 method (PRIVATE). 668 """ 669 # nomenclature is according to PMID: 10666704 670 from collections import defaultdict 671 from scipy.linalg import expm 672 fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0}, 673 {'A': 0, 'G': 0, 'C': 0, 'T': 0}, 674 {'A': 0, 'G': 0, 'C': 0, 'T': 0}] 675 codon_fold_dict = _get_codon_fold(codon_table) 676 fold0_cnt = defaultdict(int) 677 fold4_cnt = defaultdict(int) 678 for codon in seq1 + seq2: 679 # count sites at different codon position 680 if codon != '---': 681 fcodon[0][codon[0]] += 1 682 fcodon[1][codon[1]] += 1 683 fcodon[2][codon[2]] += 1 684 # count sites in different degenerate fold class 685 fold_num = codon_fold_dict[codon] 686 for i, f in enumerate(fold_num): 687 if f == '0': 688 fold0_cnt[codon[i]] += 1 689 elif f == '4': 690 fold4_cnt[codon[i]] += 1 691 f0_total = sum(fold0_cnt.values()) 692 f4_total = sum(fold4_cnt.values()) 693 for i, j in zip(fold0_cnt, fold4_cnt): 694 fold0_cnt[i] = fold0_cnt[i]/f0_total 695 fold4_cnt[i] = fold4_cnt[i]/f4_total 696 # TODO: 697 # the initial kappa is different from what yn00 gives, 698 # try to find the problem. 699 TV = _get_TV(seq1, seq2, codon_table=codon_table) 700 k04 = (_get_kappa_t(fold0_cnt, TV), _get_kappa_t(fold4_cnt, TV)) 701 kappa = (f0_total*k04[0]+f4_total*k04[1])/(f0_total+f4_total) 702 #kappa = 2.4285 703 # count synonymous sites and non-synonymous sites 704 for i in range(3): 705 tot = sum(fcodon[i].values()) 706 fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items()) 707 pi = defaultdict(int) 708 for i in list(codon_table.forward_table.keys()) + codon_table.stop_codons: 709 if 'U' not in i: 710 pi[i] = 0 711 for i in seq1 + seq2: 712 pi[i] += 1 713 S_sites1, N_sites1, bfreqSN1 = _count_site_YN00(seq1, seq2, pi, 714 k=kappa, 715 codon_table=codon_table) 716 S_sites2, N_sites2, bfreqSN2 = _count_site_YN00(seq2, seq1, pi, 717 k=kappa, 718 codon_table=codon_table) 719 N_sites = (N_sites1+N_sites2)/2 720 S_sites = (S_sites1+S_sites2)/2 721 bfreqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0}, 722 {'A': 0, 'T': 0, 'C': 0, 'G': 0}] 723 for i in range(2): 724 for b in ('A', 'T', 'C', 'G'): 725 bfreqSN[i][b] = (bfreqSN1[i][b]+bfreqSN2[i][b])/2 726 # use NG86 method to get initial t and w 727 SN = [0, 0] 728 for i, j in zip(seq1, seq2): 729 SN = [m+n for m, n in zip(SN, _count_diff_NG86( 730 i, j, 731 codon_table=codon_table) 732 ) 733 ] 734 ps = SN[0] / S_sites 735 pn = SN[1] / N_sites 736 p = sum(SN) / (S_sites+N_sites) 737 w = log(1-4.0/3*pn) / log(1-4.0/3*ps) 738 t = -3/4*log(1-4/3*p) 739 tolerance = 1e-5 740 dSdN_pre = [0, 0] 741 for temp in range(20): 742 # count synonymous and nonsynonymous differences under kappa, w, t 743 codon_lst = [i for i in \ 744 list(codon_table.forward_table.keys()) + \ 745 codon_table.stop_codons if 'U' not in i] 746 Q = _get_Q(pi, kappa, w, codon_lst, codon_table) 747 P = expm(Q*t) 748 TV = [0, 0, 0, 0] # synonymous/nonsynonymous transition/transvertion 749 sites = [0, 0] 750 codon_npath = {} 751 for i, j in zip(seq1, seq2): 752 if i != '---' and j != '---': 753 codon_npath.setdefault((i, j), 0) 754 codon_npath[(i, j)] += 1 755 for i in codon_npath: 756 tv = _count_diff_YN00(i[0], i[1], P, codon_lst, codon_table) 757 TV = [m+n*codon_npath[i] for m,n in zip(TV, tv)] 758 TV = (TV[0]/S_sites, TV[1]/S_sites), (TV[2]/N_sites, TV[3]/N_sites) 759 # according to the DistanceF84() function of yn00.c in paml, 760 # the t (e.q. 10) appears in PMID: 10666704 is dS and dN 761 dSdN = [] 762 for f, tv in zip(bfreqSN, TV): 763 dSdN.append(_get_kappa_t(f, tv, t=True)) 764 t = dSdN[0]*3*S_sites/(S_sites+N_sites)+dSdN[1]*3*N_sites/(S_sites+N_sites) 765 w = dSdN[1]/dSdN[0] 766 if all(map(lambda x: x<tolerance, [abs(i-j) for i,j in zip(dSdN, dSdN_pre)])): 767 return dSdN[1], dSdN[0] # dN, dS 768 dSdN_pre = dSdN
769
770 771 -def _get_TV(codon_lst1, codon_lst2, codon_table=default_codon_table):
772 """ 773 Argument: 774 - T - proportions of transitional differences 775 - V - proportions of transversional differences 776 """ 777 purine = ('A', 'G') 778 pyrimidine = ('C', 'T') 779 TV = [0, 0] 780 sites = 0 781 for codon1, codon2 in zip(codon_lst1, codon_lst2): 782 if "---" not in (codon1, codon2): 783 for i, j in zip(codon1, codon2): 784 if i == j: 785 pass 786 elif i in purine and j in purine: 787 TV[0] += 1 788 elif i in pyrimidine and j in pyrimidine: 789 TV[0] += 1 790 else: 791 TV[1] += 1 792 sites += 1 793 return (TV[0]/sites, TV[1]/sites)
794 #return (TV[0], TV[1])
795 796 797 -def _get_kappa_t(pi, TV, t=False):
798 """The following formula and variable names are according to 799 PMID: 10666704 800 """ 801 pi['Y'] = pi['T'] + pi['C'] 802 pi['R'] = pi['A'] + pi['G'] 803 A = (2*(pi['T']*pi['C']+pi['A']*pi['G'])+\ 804 2*(pi['T']*pi['C']*pi['R']/pi['Y']+pi['A']*pi['G']*pi['Y']/pi['R'])*\ 805 (1-TV[1]/(2*pi['Y']*pi['R']))-TV[0])/\ 806 (2*(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R'])) 807 B = 1 - TV[1]/(2*pi['Y']*pi['R']) 808 a = -0.5*log(A) # this seems to be an error in YANG's original paper 809 b = -0.5*log(B) 810 kappaF84 = a/b-1 811 if t is False: 812 kappaHKY85 = 1+(pi['T']*pi['C']/pi['Y']+pi['A']*pi['G']/pi['R'])*\ 813 kappaF84/(pi['T']*pi['C']+pi['A']*pi['G']) 814 return kappaHKY85 815 else: 816 t = (4*pi['T']*pi['C']*(1+kappaF84/pi['Y'])+\ 817 4*pi['A']*pi['G']*(1+kappaF84/pi['R'])+4*pi['Y']*pi['R'])*b 818 return t
819
820 821 -def _count_site_YN00(codon_lst1, codon_lst2, pi, k, 822 codon_table=default_codon_table):
823 """Site counting method from Ina 1995, PMID: 7699723 and modified 824 by Yang, PMID: 10666704. The method will return the total number of 825 synonymous and nonsynonymous sites and base frequencies in each 826 category. The function is equivalent to CountSites() function in 827 yn00.c of PAML. 828 """ 829 if len(codon_lst1) != len(codon_lst2): 830 raise RuntimeError("Length of two codon_lst should be the same " 831 "(%d and %d detected)".format( 832 len(codon_lst1), 833 len(codon_lst2)) 834 ) 835 else: 836 length = len(codon_lst1) 837 purine = ('A', 'G') 838 pyrimidine = ('T', 'C') 839 base_tuple = ('A', 'T', 'C', 'G') 840 codon_dict = codon_table.forward_table 841 stop = codon_table.stop_codons 842 codon_npath = {} 843 for i, j in zip(codon_lst1, codon_lst2): 844 if i != '---' and j != '---': 845 codon_npath.setdefault((i, j), 0) 846 codon_npath[(i, j)] += 1 847 S_sites = N_sites = 0 848 freqSN = [{'A': 0, 'T': 0, 'C': 0, 'G': 0}, # synonymous 849 {'A': 0, 'T': 0, 'C': 0, 'G': 0}] # nonsynonymous 850 for codon_pair, npath in codon_npath.items(): 851 codon = codon_pair[0] 852 S = N = 0 853 for pos in range(3): 854 for base in base_tuple: 855 if codon[pos] == base: continue 856 neighbor_codon = codon[:pos] + base + codon[pos+1:] 857 if neighbor_codon in stop: continue 858 weight = pi[neighbor_codon] 859 if codon[pos] in pyrimidine and base in pyrimidine: 860 weight *= k 861 elif codon[pos] in purine and base in purine: 862 weight *= k 863 if codon_dict[codon] == codon_dict[neighbor_codon]: 864 S += weight 865 freqSN[0][base] += weight*npath 866 else: 867 N += weight 868 freqSN[1][base] += weight*npath 869 S_sites += S*npath 870 N_sites += N*npath 871 norm_const = 3*length/(S_sites+N_sites) 872 S_sites *= norm_const 873 N_sites *= norm_const 874 for i in freqSN: 875 norm_const = sum(i.values()) 876 for b in i: 877 i[b] /= norm_const 878 return S_sites, N_sites, freqSN
879
880 881 -def _count_diff_YN00(codon1, codon2, P, codon_lst, 882 codon_table=default_codon_table):
883 """Count differences between two codons (three-letter string). 884 The function will weighted multiple pathways from codon1 to codon2 885 according to P matrix of codon substitution. The proportion 886 of transition and transvertion (TV) will also be calculated in 887 the function (PRIVATE). 888 """ 889 if not all([isinstance(codon1, str), isinstance(codon2, str)]): 890 raise TypeError("_count_diff_YN00 accept string object to represent " 891 "codon ({0}, {1} detected)".format( 892 type(codon1), 893 type(codon2)) 894 ) 895 if len(codon1) != 3 or len(codon2) != 3: 896 raise RuntimeError("codon should be three letter string ({0}, {1} " 897 "detected)".format(len(codon1), len(codon2))) 898 TV = [0, 0, 0, 0] # transition and transvertion counts (synonymous and nonsynonymous) 899 site = 0 900 if codon1 == '---' or codon2 == '---': 901 return TV 902 base_tuple = ('A', 'C', 'G', 'T') 903 if not all([i in base_tuple for i in codon1]): 904 raise RuntimeError("Unrecognized character detected in codon1 {0} " 905 "(Codon is consist of " 906 "A, T, C or G)".format(codon1)) 907 if not all([i in base_tuple for i in codon2]): 908 raise RuntimeError("Unrecognized character detected in codon2 {0} " 909 "(Codon is consist of " 910 "A, T, C or G)".format(codon2)) 911 if codon1 == codon2: 912 return TV 913 else: 914 diff_pos = [] 915 for i, k in enumerate(zip(codon1, codon2)): 916 if k[0] != k[1]: 917 diff_pos.append(i) 918 def count_TV(codon1, codon2, diff, codon_table, weight=1): 919 purine = ('A', 'G') 920 pyrimidine = ('T', 'C') 921 dic = codon_table.forward_table 922 stop = codon_table.stop_codons 923 if codon1 in stop or codon2 in stop: 924 # stop codon is always considered as nonsynonymous 925 if codon1[diff] in purine and codon2[diff] in purine: 926 return [0, 0, weight, 0] 927 elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine: 928 return [0, 0, weight, 0] 929 else: 930 return [0, 0, 0, weight] 931 elif dic[codon1] == dic[codon2]: 932 if codon1[diff] in purine and codon2[diff] in purine: 933 return [weight, 0, 0, 0] 934 elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine: 935 return [weight, 0, 0, 0] 936 else: 937 return [0, weight, 0, 0] 938 else: 939 if codon1[diff] in purine and codon2[diff] in purine: 940 return [0, 0, weight, 0] 941 elif codon1[diff] in pyrimidine and codon2[diff] in pyrimidine: 942 return [0, 0, weight, 0] 943 else: 944 return [0, 0, 0, weight]
945 if len(diff_pos) == 1: 946 prob = 1 947 TV = [p+q for p,q in zip(TV,count_TV(codon1, codon2, diff_pos[0], codon_table))] 948 elif len(diff_pos) == 2: 949 codon2_aa = codon_table.forward_table[codon2] 950 tmp_codon = [codon1[:i] + codon2[i] + codon1[i+1:] \ 951 for i in diff_pos] 952 path_prob = [] 953 for i in tmp_codon: 954 codon_idx = list(map(codon_lst.index, [codon1, i, codon2])) 955 prob = (P[codon_idx[0], codon_idx[1]], 956 P[codon_idx[1], codon_idx[2]]) 957 path_prob.append(prob[0]*prob[1]) 958 path_prob = [2*i/sum(path_prob) for i in path_prob] 959 for n, i in enumerate(diff_pos): 960 temp_codon = codon1[:i] + codon2[i] + codon1[i+1:] 961 TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i, 962 codon_table, 963 weight=path_prob[n]/2)) 964 ] 965 TV = [p+q for p,q in zip(TV,count_TV(codon1, temp_codon, i, 966 codon_table, 967 weight=path_prob[n]/2)) 968 ] 969 elif len(diff_pos) == 3: 970 codon2_aa = codon_table.forward_table[codon2] 971 paths = list(permutations([0, 1, 2], 3)) 972 path_prob = [] 973 tmp_codon = [] 974 for p in paths: 975 tmp1 = codon1[:p[0]] + codon2[p[0]] + codon1[p[0]+1:] 976 tmp2 = tmp1[:p[1]] + codon2[p[1]] + tmp1[p[1]+1:] 977 tmp_codon.append((tmp1, tmp2)) 978 codon_idx = list(map(codon_lst.index, [codon1, tmp1, tmp2, codon2])) 979 prob = (P[codon_idx[0], codon_idx[1]], 980 P[codon_idx[1], codon_idx[2]], 981 P[codon_idx[2], codon_idx[3]]) 982 path_prob.append(prob[0]*prob[1]*prob[2]) 983 path_prob = [3*i/sum(path_prob) for i in path_prob] 984 for i, j, k in zip(tmp_codon, path_prob, paths): 985 TV = [p+q for p,q in zip(TV, count_TV(codon1, i[0], k[0], 986 codon_table, weight=j/3)) 987 ] 988 TV = [p+q for p,q in zip(TV, count_TV(i[0], i[1], k[1], 989 codon_table, weight=j/3)) 990 ] 991 TV = [p+q for p,q in zip(TV, count_TV(i[1], codon2, k[1], 992 codon_table, weight=j/3)) 993 ] 994 if codon1 in codon_table.stop_codons or codon2 in codon_table.stop_codons: 995 site = [0, 3] 996 elif codon_table.forward_table[codon1] == codon_table.forward_table[codon2]: 997 site = [3, 0] 998 else: 999 site = [0, 3] 1000 return TV 1001
1002 1003 ################################################################# 1004 # private functions for Maximum Likelihood method 1005 ################################################################# 1006 1007 -def _ml(seq1, seq2, cmethod, codon_table):
1008 """Main function for ML method (PRIVATE). 1009 """ 1010 from collections import Counter 1011 from scipy.optimize import minimize 1012 codon_cnt = Counter() 1013 pi = _get_pi(seq1, seq2, cmethod, codon_table=codon_table) 1014 for i, j in zip(seq1, seq2): 1015 #if i != j and ('---' not in (i, j)): 1016 if '---' not in (i, j): 1017 codon_cnt[(i,j)] += 1 1018 codon_lst = [i for i in \ 1019 list(codon_table.forward_table.keys()) + codon_table.stop_codons \ 1020 if 'U' not in i] 1021 # apply optimization 1022 def func(params, pi=pi, codon_cnt=codon_cnt, codon_lst=codon_lst, 1023 codon_table=codon_table): 1024 """params = [t, k, w]""" 1025 return -_likelihood_func( 1026 params[0], params[1], params[2], pi, 1027 codon_cnt, codon_lst=codon_lst, 1028 codon_table=codon_table)
1029 # count sites 1030 opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B', \ 1031 bounds=((1e-10, 20), (1e-10, 20), (1e-10, 10)), 1032 tol=1e-5) 1033 t, k, w = opt_res.x 1034 Q = _get_Q(pi, k, w, codon_lst, codon_table) 1035 Sd = Nd = 0 1036 for i, c1 in enumerate(codon_lst): 1037 for j, c2 in enumerate(codon_lst): 1038 if i != j: 1039 try: 1040 if codon_table.forward_table[c1] == \ 1041 codon_table.forward_table[c2]: 1042 # synonymous count 1043 Sd += pi[c1] * Q[i, j] 1044 else: 1045 # nonsynonymous count 1046 Nd += pi[c1] * Q[i, j] 1047 except: 1048 # This is probably due to stop codons 1049 pass 1050 Sd *= t 1051 Nd *= t 1052 # count differences (with w fixed to 1) 1053 opt_res = minimize(func, [1, 0.1, 2], method='L-BFGS-B', 1054 bounds=((1e-10, 20), (1e-10, 20), (1, 1)), 1055 tol=1e-5) 1056 t, k, w = opt_res.x 1057 Q = _get_Q(pi, k, w, codon_lst, codon_table) 1058 rhoS = rhoN = 0 1059 for i, c1 in enumerate(codon_lst): 1060 for j, c2 in enumerate(codon_lst): 1061 if i != j: 1062 try: 1063 if codon_table.forward_table[c1] == \ 1064 codon_table.forward_table[c2]: 1065 # synonymous count 1066 rhoS += pi[c1] * Q[i, j] 1067 else: 1068 # nonsynonymous count 1069 rhoN += pi[c1] * Q[i, j] 1070 except: 1071 # This is probably due to stop codons 1072 pass 1073 rhoS *= 3 1074 rhoN *= 3 1075 dN = Nd/rhoN 1076 dS = Sd/rhoS 1077 return dN, dS 1078
1079 1080 -def _get_pi(seq1, seq2, cmethod, codon_table=default_codon_table):
1081 """Obtain codon frequency dict (pi) from two codon list (PRIVATE). 1082 This function is designed for ML method. Available counting methods 1083 (cfreq) are F1x4, F3x4 and F64. 1084 """ 1085 #TODO: 1086 # Stop codon should not be allowed according to Yang. 1087 # Try to modify this! 1088 pi = {} 1089 if cmethod == 'F1x4': 1090 fcodon = {'A': 0, 'G': 0, 'C': 0, 'T': 0} 1091 for i in seq1 + seq2: 1092 if i != '---': 1093 for c in i: fcodon[c] += 1 1094 tot = sum(fcodon.values()) 1095 fcodon = dict((j, k/tot) for j, k in fcodon.items()) 1096 for i in codon_table.forward_table.keys() + codon_table.stop_codons: 1097 if 'U' not in i: 1098 pi[i] = fcodon[i[0]]*fcodon[i[1]]*fcodon[i[2]] 1099 elif cmethod == 'F3x4': 1100 # three codon position 1101 fcodon = [{'A': 0, 'G': 0, 'C': 0, 'T': 0}, 1102 {'A': 0, 'G': 0, 'C': 0, 'T': 0}, 1103 {'A': 0, 'G': 0, 'C': 0, 'T': 0}] 1104 for i in seq1 + seq2: 1105 if i != '---': 1106 fcodon[0][i[0]] += 1 1107 fcodon[1][i[1]] += 1 1108 fcodon[2][i[2]] += 1 1109 for i in range(3): 1110 tot = sum(fcodon[i].values()) 1111 fcodon[i] = dict((j, k/tot) for j, k in fcodon[i].items()) 1112 for i in list(codon_table.forward_table.keys()) + \ 1113 codon_table.stop_codons: 1114 if 'U' not in i: 1115 pi[i] = fcodon[0][i[0]]*fcodon[1][i[1]]*fcodon[2][i[2]] 1116 elif cmethod == 'F61': 1117 for i in codon_table.forward_table.keys() + codon_table.stop_codons: 1118 if 'U' not in i: 1119 pi[i] = 0.1 1120 for i in seq1 + seq2: 1121 if i != '---': pi[i] += 1 1122 tot = sum(pi.values()) 1123 pi = dict((j, k/tot) for j, k in pi.items()) 1124 return pi
1125
1126 1127 -def _q(i, j, pi, k, w, codon_table=default_codon_table):
1128 """Q matrix for codon substitution. 1129 1130 Arguments: 1131 - i, j : three letter codon string 1132 - pi : expected codon frequency 1133 - k : transition/transversion ratio 1134 - w : nonsynonymous/synonymous rate ratio 1135 - codon_table: Bio.Data.CodonTable object 1136 """ 1137 if i == j: 1138 # diagonal elements is the sum of all other elements 1139 return 0 1140 if i in codon_table.stop_codons or j in codon_table.stop_codons: 1141 return 0 1142 if (i not in pi) or (j not in pi): 1143 return 0 1144 purine = ('A', 'G') 1145 pyrimidine = ('T', 'C') 1146 diff = [] 1147 for n, (c1, c2) in enumerate(zip(i, j)): 1148 if c1 != c2: 1149 diff.append((n, c1, c2)) 1150 if len(diff) >= 2: 1151 return 0 1152 if codon_table.forward_table[i] == codon_table.forward_table[j]: 1153 # synonymous substitution 1154 if diff[0][1] in purine and diff[0][2] in purine: 1155 # transition 1156 return k*pi[j] 1157 elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine: 1158 # transition 1159 return k*pi[j] 1160 else: 1161 # transversion 1162 return pi[j] 1163 else: 1164 # nonsynonymous substitution 1165 if diff[0][1] in purine and diff[0][2] in purine: 1166 # transition 1167 return w*k*pi[j] 1168 elif diff[0][1] in pyrimidine and diff[0][2] in pyrimidine: 1169 # transition 1170 return w*k*pi[j] 1171 else: 1172 # transversion 1173 return w*pi[j]
1174
1175 1176 -def _get_Q(pi, k, w, codon_lst, codon_table):
1177 """Q matrix for codon substitution""" 1178 import numpy as np 1179 codon_num = len(codon_lst) 1180 Q = np.zeros((codon_num, codon_num)) 1181 for i in range(codon_num): 1182 for j in range(codon_num): 1183 if i != j: 1184 Q[i, j] = _q(codon_lst[i], codon_lst[j], pi, k, w, 1185 codon_table=codon_table) 1186 nucl_substitutions = 0 1187 for i in range(codon_num): 1188 Q[i,i] = -sum(Q[i,:]) 1189 try: 1190 nucl_substitutions += pi[codon_lst[i]] * (-Q[i, i]) 1191 except KeyError: 1192 pass 1193 Q = Q / nucl_substitutions 1194 return Q
1195
1196 1197 -def _likelihood_func(t, k, w, pi, codon_cnt, codon_lst, codon_table):
1198 """likelihood function for ML method 1199 """ 1200 from scipy.linalg import expm 1201 Q = _get_Q(pi, k, w, codon_lst, codon_table) 1202 P = expm(Q*t) 1203 l = 0 # likelihood value 1204 for i, c1 in enumerate(codon_lst): 1205 for j, c2 in enumerate(codon_lst): 1206 if (c1, c2) in codon_cnt: 1207 if P[i, j] * pi[c1] <= 0: 1208 l += codon_cnt[(c1, c2)]*0 1209 else: 1210 l += codon_cnt[(c1, c2)]*log(pi[c1]*P[i, j]) 1211 return l
1212 1213 1214 if __name__ == "__main__": 1215 from Bio._utils import run_doctest 1216 run_doctest() 1217