Package Bio :: Package CodonAlign
[hide private]
[frames] | no frames]

Source Code for Package Bio.CodonAlign

  1  # Copyright 2013 by Zheng Ruan (zruan1991@gmail.com). 
  2  # All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Code for dealing with Codon Alignment. 
  8  """ 
  9  from __future__ import print_function 
 10  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 11   
 12  from Bio import BiopythonExperimentalWarning 
 13   
 14  import warnings 
 15  warnings.warn('Bio.CodonAlign is an experimental module which may undergo ' 
 16                'significant changes prior to its future official release.', 
 17                BiopythonExperimentalWarning) 
 18   
 19  try: 
 20      from itertools import izip 
 21  except ImportError: 
 22      izip = zip 
 23  #from itertools import izip 
 24   
 25  from Bio.SeqRecord import SeqRecord 
 26   
 27  from Bio.CodonAlign.CodonSeq import CodonSeq 
 28  from Bio.CodonAlign.CodonAlignment import CodonAlignment, mktest 
 29  from Bio.CodonAlign.CodonAlphabet import CodonAlphabet 
 30  from Bio.CodonAlign.CodonAlphabet import default_codon_table, default_codon_alphabet 
 31  from Bio.CodonAlign.CodonAlphabet import get_codon_alphabet as _get_codon_alphabet 
 32   
33 -def build(pro_align, nucl_seqs, corr_dict=None, gap_char='-', unknown='X', 34 codon_table=default_codon_table, alphabet=None, 35 complete_protein=False, anchor_len=10, max_score=10):
36 """Build a codon alignment from a protein alignment and 37 corresponding nucleotide sequences 38 39 Arguments: 40 - pro_align - a protein MultipleSeqAlignment object 41 - nucl_align - an object returned by SeqIO.parse or SeqIO.index 42 or a colloction of SeqRecord. 43 - alphabet - alphabet for the returned codon alignment 44 - corr_dict - a dict that maps protein id to nucleotide id 45 - complete_protein - whether the sequence begins with a start 46 codon 47 - frameshift - whether to appply frameshift detection 48 49 Return a CodonAlignment object 50 51 >>> from Bio.Alphabet import IUPAC 52 >>> from Bio.Seq import Seq 53 >>> from Bio.SeqRecord import SeqRecord 54 >>> from Bio.Align import MultipleSeqAlignment 55 >>> seq1 = SeqRecord(Seq('TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG', 56 ... alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro1') 57 >>> seq2 = SeqRecord(Seq('TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG', 58 ... alphabet=IUPAC.IUPACUnambiguousDNA()), id='pro2') 59 >>> pro1 = SeqRecord(Seq('SGTARTKLLLLLAALCAAGGALE', alphabet=IUPAC.protein),id='pro1') 60 >>> pro2 = SeqRecord(Seq('SGTSRTKRLLLLAALGAAGGALE', alphabet=IUPAC.protein),id='pro2') 61 >>> aln = MultipleSeqAlignment([pro1, pro2]) 62 >>> codon_aln = build(aln, [seq1, seq2]) 63 >>> print(codon_aln) 64 CodonAlphabet(Standard) CodonAlignment with 2 rows and 69 columns (23 codons) 65 TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGT...GAG pro1 66 TCAGGGACTTCGAGAACCAAGCG-CTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGT...GAG pro2 67 68 """ 69 # TODO 70 # add an option to allow the user to specify the returned object? 71 72 from Bio.Alphabet import ProteinAlphabet 73 from Bio.Align import MultipleSeqAlignment 74 75 # check the type of object of pro_align 76 if not isinstance(pro_align, MultipleSeqAlignment): 77 raise TypeError("the first argument should be a MultipleSeqAlignment " 78 "object") 79 # check the alphabet of pro_align 80 for pro in pro_align: 81 if not isinstance(pro.seq.alphabet, ProteinAlphabet): 82 raise TypeError("Alphabet Error!\nThe input alignment should be " 83 "a *PROTEIN* alignment") 84 if alphabet is None: 85 alphabet = _get_codon_alphabet(codon_table, gap_char=gap_char) 86 # check whether the number of seqs in pro_align and nucl_seqs is 87 # the same 88 pro_num = len(pro_align) 89 if corr_dict is None: 90 if nucl_seqs.__class__.__name__ == "generator": 91 # nucl_seqs will be a tuple if read by SeqIO.parse() 92 nucl_seqs = tuple(nucl_seqs) 93 nucl_num = len(nucl_seqs) 94 if pro_num > nucl_num: 95 raise ValueError("More Number of SeqRecords in Protein Alignment " 96 "({0}) than the Number of Nucleotide SeqRecords " 97 "({1}) are found!".format(pro_num, nucl_num)) 98 99 # Determine the protein sequences and nucl sequences 100 # correspondance. If nucl_seqs is a list, tuple or read by 101 # SeqIO.parse(), we assume the order of sequences in pro_align 102 # and nucl_seqs are the same. If nucl_seqs is a dict or read by 103 # SeqIO.index(), we match seqs in pro_align and those in 104 # nucl_seq by their id. 105 if nucl_seqs.__class__.__name__ in ("_IndexedSeqFileDict", "dict"): 106 corr_method = 1 107 elif nucl_seqs.__class__.__name__ in ("list", "tuple"): 108 corr_method = 0 109 else: 110 raise TypeError("Nucl Sequences Error, Unknown type to assign " 111 "correspondance method") 112 else: 113 if not isinstance(corr_dict, dict): 114 raise TypeError("corr_dict should be a dict that corresponds " 115 "protein id to nucleotide id!") 116 if len(corr_dict) >= pro_num: 117 # read by SeqIO.parse() 118 if nucl_seqs.__class__.__name__ == "generator": 119 from Bio import SeqIO 120 nucl_seqs = SeqIO.to_dict(nucl_seqs) 121 elif nucl_seqs.__class__.__name__ in ("list", "tuple"): 122 nucl_seqs = dict((i.id, i) for i in nucl_seqs) 123 #nucl_seqs = {i.id: i for i in nucl_seqs} 124 elif nucl_seqs.__class__.__name__ in \ 125 ("_IndexedSeqFileDict", "dict"): 126 pass 127 else: 128 raise TypeError("Nucl Sequences Error, Unknown type of " 129 "Nucleotide Records!") 130 corr_method = 2 131 else: 132 raise RuntimeError("Number of items in corr_dict ({0}) is less " 133 "than number of protein records " 134 "({1})".format(len(corr_dict), pro_num)) 135 136 # set up pro-nucl correspondance based on corr_method 137 # corr_method = 0, consecutive pairing 138 if corr_method == 0: 139 pro_nucl_pair = izip(pro_align, nucl_seqs) 140 # corr_method = 1, keyword pairing 141 elif corr_method == 1: 142 nucl_id = set(nucl_seqs.keys()) 143 pro_id = set([i.id for i in pro_align]) 144 # check if there is pro_id that does not have a nucleotide match 145 if pro_id - nucl_id: 146 diff = pro_id - nucl_id 147 raise ValueError("Protein Record {0} cannot find a nucleotide " 148 "sequence match, please check the " 149 "id".format(', '.join(diff))) 150 else: 151 pro_nucl_pair = [] 152 for pro_rec in pro_align: 153 pro_nucl_pair.append((pro_rec, nucl_seqs[pro_rec.id])) 154 # corr_method = 2, dict pairing 155 elif corr_method == 2: 156 pro_nucl_pair = [] 157 for pro_rec in pro_align: 158 try: 159 nucl_id = corr_dict[pro_rec.id] 160 except KeyError: 161 print("Protein record (%s) is not in corr_dict!" % pro_rec.id) 162 exit(1) 163 pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id])) 164 165 codon_aln = [] 166 shift = None 167 for pair in pro_nucl_pair: 168 # Beaware that the following span corresponds to an ungapped 169 # nucleotide sequence. 170 corr_span = _check_corr(pair[0], pair[1], gap_char=gap_char, 171 codon_table=codon_table, 172 complete_protein=complete_protein, 173 anchor_len=anchor_len) 174 if not corr_span: 175 raise ValueError("Protein Record {0} and Nucleotide Record {1} do" 176 " not match!".format((pair[0].id, pair[1].id))) 177 else: 178 codon_rec = _get_codon_rec(pair[0], pair[1], corr_span, 179 alphabet=alphabet, 180 complete_protein=False, 181 max_score=max_score) 182 codon_aln.append(codon_rec) 183 if corr_span[1] == 2: 184 shift = True 185 if shift is True: 186 return CodonAlignment(_align_shift_recs(codon_aln), alphabet=alphabet) 187 else: 188 return CodonAlignment(codon_aln, alphabet=alphabet)
189 190
191 -def _codons2re(codons):
192 """Generate regular expression based on a given list of codons 193 """ 194 reg = '' 195 for i in izip(*codons): 196 if len(set(i)) == 1: 197 reg += ''.join(set(i)) 198 else: 199 reg += '[' + ''.join(set(i)) + ']' 200 return reg
201 202
203 -def _get_aa_regex(codon_table, stop='*', unknown='X'):
204 """Set up the regular expression of a given CodonTable for 205 futher use. 206 207 >>> from Bio.Data.CodonTable import generic_by_id 208 >>> p = generic_by_id[1] 209 >>> t = _get_aa_regex(p) 210 >>> print(t['A'][0]) 211 G 212 >>> print(t['A'][1]) 213 C 214 >>> print(sorted(list(t['A'][2:]))) 215 ['A', 'C', 'G', 'T', 'U', '[', ']'] 216 >>> print(sorted(list(t['L'][:5]))) 217 ['C', 'T', 'U', '[', ']'] 218 >>> print(sorted(list(t['L'][5:9]))) 219 ['T', 'U', '[', ']'] 220 >>> print(sorted(list(t['L'][9:]))) 221 ['A', 'C', 'G', 'T', 'U', '[', ']'] 222 223 """ 224 from Bio.Data.CodonTable import CodonTable 225 if not isinstance(codon_table, CodonTable): 226 raise TypeError("Input table is not a instance of " 227 "Bio.Data.CodonTable object") 228 aa2codon = {} 229 for codon, aa in codon_table.forward_table.items(): 230 aa2codon.setdefault(aa, []).append(codon) 231 for aa, codons in aa2codon.items(): 232 aa2codon[aa] = _codons2re(codons) 233 aa2codon[stop] = _codons2re(codon_table.stop_codons) 234 aa2codon[unknown] = '...' 235 return aa2codon
236 237
238 -def _check_corr(pro, nucl, gap_char='-', codon_table=default_codon_table, 239 complete_protein=False, anchor_len=10):
240 """check if a give protein SeqRecord can be translated by another 241 nucleotide SeqRecord. 242 """ 243 import re 244 from Bio.Alphabet import NucleotideAlphabet 245 246 if not all([isinstance(pro, SeqRecord), isinstance(nucl, SeqRecord)]): 247 raise TypeError("_check_corr accept two SeqRecord object. Please " 248 "check your input.") 249 250 def get_alpha(alpha): 251 if hasattr(alpha, 'alphabet'): 252 return get_alpha(alpha.alphabet) 253 else: 254 return alpha
255 256 if not isinstance(get_alpha(nucl.seq.alphabet), NucleotideAlphabet): 257 raise TypeError("Alphabet for nucl should be an instance of " 258 "NucleotideAlphabet, {0} " 259 "detected".format(str(nucl.seq.alphabet))) 260 261 aa2re = _get_aa_regex(codon_table) 262 pro_re = "" 263 for aa in pro.seq: 264 if aa != gap_char: 265 pro_re += aa2re[aa] 266 267 nucl_seq = str(nucl.seq.upper().ungap(gap_char)) 268 match = re.search(pro_re, nucl_seq) 269 if match: 270 # mode = 0, direct match 271 return (match.span(), 0) 272 else: 273 # Might caused by mismatches or frameshift, using anchors to 274 # have a try 275 #anchor_len = 10 # adjust this value to test performance 276 pro_seq = str(pro.seq).replace(gap_char, "") 277 anchors = [pro_seq[i:(i+anchor_len)] for i in 278 range(0, len(pro_seq), anchor_len)] 279 # if the last anchor is less than the specified anchor 280 # size, we combine the penultimate and the last anchor 281 # together as the last one. 282 # TODO: modify this to deal with short sequence with only 283 # one anchor. 284 if len(anchors[-1]) < anchor_len: 285 anchors[-1] = anchors[-2] + anchors[-1] 286 287 pro_re = [] 288 anchor_distance = 0 289 anchor_pos = [] 290 for i, anchor in enumerate(anchors): 291 this_anchor_len = len(anchor) 292 qcodon = "" 293 fncodon = "" 294 ## dirty code to deal with the last anchor ## 295 # as the last anchor is combined in the steps 296 # above, we need to get the true last anchor to 297 # pro_re 298 if this_anchor_len == anchor_len: 299 for aa in anchor: 300 if complete_protein is True and i == 0: 301 qcodon += _codons2re(codon_table.start_codons) 302 fncodon += aa2re['X'] 303 continue 304 qcodon += aa2re[aa] 305 fncodon += aa2re['X'] 306 match = re.search(qcodon, nucl_seq) 307 elif this_anchor_len > anchor_len: 308 last_qcodon = "" 309 last_fcodon = "" 310 for j in range(anchor_len, len(anchor)): 311 last_qcodon += aa2re[anchor[j]] 312 last_fcodon += aa2re['X'] 313 match = re.search(last_qcodon, nucl_seq) 314 # build full_pro_re from anchors 315 if match: 316 anchor_pos.append((match.start(), match.end(), i)) 317 if this_anchor_len == anchor_len: 318 pro_re.append(qcodon) 319 else: 320 pro_re.append(last_qcodon) 321 else: 322 if this_anchor_len == anchor_len: 323 pro_re.append(fncodon) 324 else: 325 pro_re.append(last_fcodon) 326 full_pro_re = "".join(pro_re) 327 match = re.search(full_pro_re, nucl_seq) 328 if match: 329 # mode = 1, mismatch 330 return (match.span(), 1) 331 else: 332 # check frames of anchors 333 # ten frameshift events are allowed in a sequence 334 first_anchor = True 335 shift_id_pos = 0 336 # check the first anchor 337 if first_anchor is True and anchor_pos[0][2] != 0: 338 shift_val_lst = [1, 2, 3*anchor_len-2, 3*anchor_len-1, 0] 339 sh_anc = anchors[0] 340 for shift_val in shift_val_lst: 341 if shift_val == 0: 342 qcodon = None 343 break 344 if shift_val in (1, 2): 345 sh_nuc_len = anchor_len*3+shift_val 346 elif shift_val in (3*anchor_len-2, 3*anchor_len-1): 347 sh_nuc_len = anchor_len*3-(3*anchor_len-shift_val) 348 if anchor_pos[0][0] >= sh_nuc_len: 349 sh_nuc = nucl_seq[anchor_pos[0][0]-sh_nuc_len:anchor_pos[0][0]] 350 else: 351 #this is unlikely to produce the correct output 352 sh_nuc = nucl_seq[:anchor_pos[0][0]] 353 qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc, 354 shift_val, 355 aa2re, 356 anchor_len, 357 shift_id_pos) 358 if qcodon is not None and qcodon != -1: 359 # pro_re[0] should be '.'*anchor_len, therefore I 360 # replace it. 361 pro_re[0] = qcodon 362 break 363 if qcodon == -1: 364 warnings.warn("first frameshift detection failed for " 365 "{0}".format(nucl.id)) 366 # check anchors in the middle 367 for i in range(len(anchor_pos)-1): 368 shift_val = (anchor_pos[i+1][0]-anchor_pos[i][0]) % \ 369 (3*anchor_len) 370 sh_anc = "".join(anchors[anchor_pos[i][2]:anchor_pos[i+1][2]]) 371 sh_nuc = nucl_seq[anchor_pos[i][0]:anchor_pos[i+1][0]] 372 qcodon = None 373 if shift_val != 0: 374 qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc, 375 shift_val, 376 aa2re, 377 anchor_len, 378 shift_id_pos) 379 if qcodon is not None and qcodon != -1: 380 pro_re[anchor_pos[i][2]:anchor_pos[i+1][2]] = [qcodon] 381 qcodon = None 382 elif qcodon == -1: 383 warnings.warn("middle frameshift detection failed for " 384 "{0}".format(nucl.id)) 385 # check the last anchor 386 if anchor_pos[-1][2]+1 == len(anchors)-1: 387 sh_anc = anchors[-1] 388 this_anchor_len = len(sh_anc) 389 shift_val_lst = [1, 2, 3*this_anchor_len-2, 3*this_anchor_len-1, 0] 390 for shift_val in shift_val_lst: 391 if shift_val == 0: 392 qcodon = None 393 break 394 if shift_val in (1, 2): 395 sh_nuc_len = this_anchor_len*3+shift_val 396 elif shift_val in \ 397 (3*this_anchor_len-2, 3*this_anchor_len-1): 398 sh_nuc_len = this_anchor_len*3-(3*this_anchor_len-shift_val) 399 if len(nucl_seq)-anchor_pos[-1][0] >= sh_nuc_len: 400 sh_nuc = nucl_seq[anchor_pos[-1][0]:anchor_pos[-1][0]+sh_nuc_len] 401 else: 402 #this is unlikely to produce the correct output 403 sh_nuc = nucl_seq[anchor_pos[-1][0]:] 404 qcodon, shift_id_pos = _get_shift_anchor_re(sh_anc, sh_nuc, 405 shift_val, 406 aa2re, 407 this_anchor_len, 408 shift_id_pos) 409 if qcodon is not None and qcodon != -1: 410 pro_re.pop() 411 pro_re[-1] = qcodon 412 break 413 if qcodon == -1: 414 warnings.warn("last frameshift detection failed for " 415 "{0}".format(nucl.id)) 416 # try global match 417 full_pro_re = "".join(pro_re) 418 match = re.search(full_pro_re, nucl_seq) 419 if match: 420 return (match.span(), 2, match) 421 else: 422 raise RuntimeError("Protein SeqRecord ({0}) and Nucleotide " 423 "SeqRecord ({1}) do not " 424 "match!".format((pro.id, nucl.id))) 425 426
427 -def _get_shift_anchor_re(sh_anc, sh_nuc, shift_val, aa2re, anchor_len, 428 shift_id_pos):
429 """This function tries all the best to come up with an re that 430 matches a potentially shifted anchor. 431 432 Arguments: 433 - sh_anc - shifted anchor sequence 434 - sh_nuc - potentially corresponding nucleotide sequence 435 of sh_anc 436 - shift_val - 1 or 2 indicates forward frame shift, whereas 437 3*anchor_len-1 or 3*anchor_len-2 indicates 438 backward shift 439 - aa2re - aa to codon re dict 440 - anchor_len - length of the anchor 441 - shift_id_pos - specify current shift name we are at 442 """ 443 import re 444 shift_id = [chr(i) for i in range(97, 107)] 445 if 0 < shift_val < 3*anchor_len-2: 446 #if shift_val in (1, 2): 447 for j in range(len(sh_anc)): 448 qcodon = "^" 449 for k, aa in enumerate(sh_anc): 450 if k == j: 451 qcodon += aa2re[aa] + "(?P<" + shift_id[shift_id_pos] + ">..*)" 452 else: 453 qcodon += aa2re[aa] 454 qcodon += "$" 455 match = re.search(qcodon, sh_nuc) 456 if match: 457 qcodon = qcodon.replace('^', '').replace('$', '') 458 shift_id_pos += 1 459 return qcodon, shift_id_pos 460 if not match: 461 # failed to find a match (frameshift) 462 return -1, shift_id_pos 463 elif shift_val in (3*anchor_len-1, 3*anchor_len-2): 464 shift_val = 3*anchor_len-shift_val 465 # obtain shifted anchor and corresponding nucl 466 # first check if the shifted pos is just at the end of the 467 # previous anchor. 468 for j in range(1, len(sh_anc)): 469 qcodon = "^" 470 for k, aa in enumerate(sh_anc): 471 if k == j-1: 472 # will be considered in the next step 473 pass 474 elif k == j: 475 qcodon += _merge_aa2re( 476 sh_anc[j-1], sh_anc[j], shift_val, aa2re, 477 shift_id[shift_id_pos].upper()) 478 else: 479 qcodon += aa2re[aa] 480 qcodon += '$' 481 match = re.search(qcodon, sh_nuc) 482 if match: 483 qcodon = qcodon.replace('^', '').replace('$', '') 484 shift_id_pos += 1 485 return qcodon, shift_id_pos 486 if not match: 487 # failed to find a match (frameshift) 488 return -1, shift_id_pos
489 490
491 -def _merge_aa2re(aa1, aa2, shift_val, aa2re, reid):
492 """Function to merge two amino acids based on detected frame shift 493 value. 494 """ 495 def get_aa_from_codonre(re_aa): 496 aas = [] 497 m = 0 498 for i in re_aa: 499 if i == '[': 500 m = -1 501 aas.append('') 502 elif i == ']': 503 m = 0 504 continue 505 elif m == -1: 506 aas[-1] = aas[-1] + i 507 elif m == 0: 508 aas.append(i) 509 return aas
510 scodon = list(map(get_aa_from_codonre, (aa2re[aa1], aa2re[aa2]))) 511 if shift_val == 1: 512 intersect = ''.join(set(scodon[0][2]) & set(scodon[1][0])) 513 scodonre = '(?P<' + reid + '>' 514 scodonre += '[' + scodon[0][0] + ']' + \ 515 '[' + scodon[0][1] + ']' + \ 516 '[' + intersect + ']' + \ 517 '[' + scodon[1][1] + ']' + \ 518 '[' + scodon[1][2] + ']' 519 elif shift_val == 2: 520 intersect1 = ''.join(set(scodon[0][1]) & set(scodon[1][0])) 521 intersect2 = ''.join(set(scodon[0][2]) & set(scodon[1][1])) 522 scodonre = '(?P<' + reid + '>' 523 scodonre += '[' + scodon[0][0] + ']' + \ 524 '[' + intersect1 + ']' + \ 525 '[' + intersect2 + ']' + \ 526 '[' + scodon[1][2] + ']' 527 scodonre += ')' 528 return scodonre 529 530
531 -def _get_codon_rec(pro, nucl, span_mode, alphabet, gap_char="-", 532 codon_table=default_codon_table, complete_protein=False, 533 max_score=10):
534 """Generate codon alignment based on regular re match (PRIVATE) 535 536 span_mode is a tuple returned by _check_corr. The first element 537 is the span of a re search, and the second element is the mode 538 for the match. 539 540 mode 541 - 0: direct match 542 - 1: mismatch (no indels) 543 - 2: frameshift 544 545 """ 546 import re 547 from Bio.Seq import Seq 548 549 nucl_seq = nucl.seq.ungap(gap_char) 550 codon_seq = "" 551 span = span_mode[0] 552 mode = span_mode[1] 553 aa2re = _get_aa_regex(codon_table) 554 if mode in (0, 1): 555 if len(pro.seq.ungap(gap_char))*3 != (span[1]-span[0]): 556 raise ValueError("Protein Record {0} and Nucleotide Record {1} " 557 "do not match!".format((pro.id, nucl.id))) 558 aa_num = 0 559 for aa in pro.seq: 560 if aa == "-": 561 codon_seq += "---" 562 elif complete_protein is True and aa_num == 0: 563 this_codon = nucl_seq._data[span[0]:span[0]+3] 564 if not re.search(_codons2re[codon_table.start_codons], 565 this_codon.upper()): 566 max_score -= 1 567 warnings.warn("start codon of {0} ({1} {2}) does not " 568 "correspond to {3} " 569 "({4})".format(pro.id, aa, aa_num, 570 nucl.id, this_codon) 571 ) 572 if max_score == 0: 573 raise RuntimeError("max_score reached for {0}! Please " 574 "raise up the tolerance to get an " 575 "alignment in anyway".format(nucl.id)) 576 codon_seq += this_codon 577 aa_num += 1 578 else: 579 this_codon = nucl_seq._data[(span[0] + 3*aa_num): 580 (span[0] + 3*(aa_num+1))] 581 if not str(Seq(this_codon.upper()).translate()) == aa: 582 max_score -= 1 583 warnings.warn("%s(%s %d) does not correspond to %s(%s)" 584 % (pro.id, aa, aa_num, nucl.id, this_codon)) 585 if max_score == 0: 586 raise RuntimeError("max_score reached for {0}! Please " 587 "raise up the tolerance to get an " 588 "alignment in anyway".format(nucl.id)) 589 codon_seq += this_codon 590 aa_num += 1 591 return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet), id=nucl.id) 592 elif mode == 2: 593 from collections import deque 594 shift_pos = deque([]) 595 shift_start = [] 596 match = span_mode[2] 597 m_groupdict = list(match.groupdict().keys()) 598 # backward frameshift 599 for i in m_groupdict: 600 shift_pos.append(match.span(i)) 601 shift_start.append(match.start(i)) 602 rf_table = [] 603 i = match.start() 604 while True: 605 rf_table.append(i) 606 i += 3 607 if i in shift_start and \ 608 m_groupdict[shift_start.index(i)].isupper(): 609 shift_index = shift_start.index(i) 610 shift_val = 6 - (shift_pos[shift_index][1] - 611 shift_pos[shift_index][0]) 612 rf_table.append(i) 613 rf_table.append(i+3-shift_val) 614 i = shift_pos[shift_index][1] 615 elif i in shift_start and \ 616 m_groupdict[shift_start.index(i)].islower(): 617 i = shift_pos[shift_start.index(i)][1] 618 if i >= match.end(): 619 break 620 aa_num = 0 621 for aa in pro.seq: 622 if aa == "-": 623 codon_seq += "---" 624 elif complete_protein is True and aa_num == 0: 625 this_codon = nucl_seq._data[rf_table[0]:rf_table[0]+3] 626 if not re.search(_codons2re[codon_table.start_codons], 627 this_codon.upper()): 628 max_score -= 1 629 warnings.warn("start codon of {0}({1} {2}) does not " 630 "correspond to {3}({4})".format( 631 pro.id, aa, aa_num, nucl.id, this_codon) 632 ) 633 codon_seq += this_codon 634 aa_num += 1 635 else: 636 if aa_num < len(pro.seq.ungap('-'))-1 and \ 637 rf_table[aa_num+1]-rf_table[aa_num]-3 < 0: 638 max_score -= 1 639 start = rf_table[aa_num] 640 end = start + (3-shift_val) 641 ngap = shift_val 642 this_codon = nucl_seq._data[start:end] + '-'*ngap 643 elif rf_table[aa_num]-rf_table[aa_num-1]-3 > 0: 644 max_score -= 1 645 start = rf_table[aa_num-1]+3 646 end = rf_table[aa_num] 647 ngap = 3-(rf_table[aa_num]-rf_table[aa_num-1]-3) 648 this_codon = nucl_seq._data[start:end] + '-'*ngap + \ 649 nucl_seq._data[rf_table[aa_num]:rf_table[aa_num]+3] 650 else: 651 start = rf_table[aa_num] 652 end = start + 3 653 this_codon = nucl_seq._data[start:end] 654 if not str(Seq(this_codon.upper()).translate()) == aa: 655 max_score -= 1 656 warnings.warn("Codon of {0}({1} {2}) does not " 657 "correspond to {3}({4})".format( 658 pro.id, aa, aa_num, nucl.id, 659 this_codon) 660 ) 661 if max_score == 0: 662 raise RuntimeError("max_score reached for {0}! Please " 663 "raise up the tolerance to get an " 664 "alignment in anyway".format(nucl.id)) 665 codon_seq += this_codon 666 aa_num += 1 667 return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet, 668 rf_table=rf_table), id=nucl.id)
669 670
671 -def _align_shift_recs(recs):
672 """This function is useful to build alignment according to the 673 frameshift detected by _check_corr. 674 675 Argument: 676 - recs - a list of SeqRecords containing a CodonSeq dictated 677 by a rf_table (with frameshift in some of them). 678 """ 679 def find_next_int(k, lst): 680 idx = lst.index(k) 681 p = 0 682 while True: 683 if isinstance(lst[idx+p], int): 684 return lst[idx+p], p 685 p += 1
686 full_rf_table_lst = [rec.seq.get_full_rf_table() for rec in recs] 687 rf_num = [0] * len(recs) 688 for k, rec in enumerate(recs): 689 for i in rec.seq.get_full_rf_table(): 690 if isinstance(i, int): 691 rf_num[k] += 1 692 # isinstance(i, float) should be True 693 elif rec.seq._data[int(i):int(i)+3] == "---": 694 rf_num[k] += 1 695 if len(set(rf_num)) != 1: 696 raise RuntimeError("Number alignable codons unequal in given records") 697 i = 0 698 rec_num = len(recs) 699 while True: 700 add_lst = [] 701 try: 702 col_rf_lst = [k[i] for k in full_rf_table_lst] 703 except IndexError: 704 # we probably reached the last codon 705 break 706 for j, k in enumerate(col_rf_lst): 707 add_lst.append((j, int(k))) 708 if isinstance(k, float) and \ 709 recs[j].seq._data[int(k):int(k)+3] != "---": 710 m, p = find_next_int(k, full_rf_table_lst[j]) 711 if (m-k) % 3 != 0: 712 gap_num = 3 - (m - k) % 3 713 else: 714 gap_num = 0 715 if gap_num != 0: 716 gaps = '-'*int(gap_num) 717 seq = recs[j].seq._data[:int(k)] + gaps + \ 718 recs[j].seq._data[int(k):] 719 full_rf_table = full_rf_table_lst[j] 720 bp = full_rf_table.index(k) 721 full_rf_table = full_rf_table[:bp] + \ 722 [v+int(gap_num) for v in full_rf_table[bp+1:]] 723 full_rf_table_lst[j] = full_rf_table 724 recs[j].seq = CodonSeq(seq, 725 rf_table=recs[j].seq.rf_table, 726 alphabet=recs[j].seq.alphabet) 727 add_lst.pop() 728 gap_num += m-k 729 i += p - 1 730 if len(add_lst) != rec_num: 731 for j, k in add_lst: 732 gaps = "-"*int(gap_num) 733 seq = recs[j].seq._data[:int(k)] + gaps + \ 734 recs[j].seq._data[int(k):] 735 full_rf_table = full_rf_table_lst[j] 736 bp = full_rf_table.index(k) 737 inter_rf = [] 738 for t in filter(lambda x: x%3==0, range(len(gaps))): 739 inter_rf.append(k+t+3.0) 740 full_rf_table = full_rf_table[:bp] + inter_rf + \ 741 [v+int(gap_num) for v in full_rf_table[bp:]] 742 full_rf_table_lst[j] = full_rf_table 743 recs[j].seq = CodonSeq(seq, 744 rf_table=recs[j].seq.rf_table, 745 alphabet=recs[j].seq.alphabet) 746 i += 1 747 return recs 748 749 750 #def toCodonAlignment(align, alphabet=default_codon_alphabet): 751 # """Function to convert a MultipleSeqAlignment to CodonAlignment. 752 # It is the user's responsibility to ensure all the requirement 753 # needed by CodonAlignment is met. 754 # 755 # """ 756 # rec = [SeqRecord(CodonSeq(str(i.seq), alphabet=alphabet), id=i.id) \ 757 # for i in align._records] 758 # return CodonAlignment(rec, alphabet=align._alphabet) 759 760 761 if __name__ == "__main__": 762 from Bio._utils import run_doctest 763 run_doctest() 764