Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # Revisions copyright 2014 by Markus Piotrowski. 
  6  # Revisions copyright 2014 by Peter Cock. 
  7  # All rights reserved. 
  8  # This code is part of the Biopython distribution and governed by its 
  9  # license.  Please see the LICENSE file that should have been included 
 10  # as part of this package. 
 11   
 12  """Miscellaneous functions for dealing with sequences.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  import re 
 17  from math import pi, sin, cos 
 18   
 19  from Bio.Seq import Seq, MutableSeq 
 20  from Bio import Alphabet 
 21  from Bio.Alphabet import IUPAC 
 22  from Bio.Data import IUPACData 
 23   
 24   
 25  ###################################### 
 26  # DNA 
 27  ###################### 
 28  # {{{ 
 29   
 30   
31 -def GC(seq):
32 """Calculates G+C content, returns the percentage (float between 0 and 100). 33 34 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 35 when counting the G and C content. The percentage is calculated against 36 the full length, e.g.: 37 38 >>> from Bio.SeqUtils import GC 39 >>> GC("ACTGN") 40 40.0 41 42 Note that this will return zero for an empty sequence. 43 """ 44 try: 45 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 46 return gc*100.0/len(seq) 47 except ZeroDivisionError: 48 return 0.0
49 50
51 -def GC123(seq):
52 """Calculates total G+C content plus first, second and third positions. 53 54 Returns a tuple of four floats (percentages between 0 and 100) for the 55 entire sequence, and the three codon positions. e.g. 56 57 >>> from Bio.SeqUtils import GC123 58 >>> GC123("ACTGTN") 59 (40.0, 50.0, 50.0, 0.0) 60 61 Copes with mixed case sequences, but does NOT deal with ambiguous 62 nucleotides. 63 """ 64 d= {} 65 for nt in ['A', 'T', 'G', 'C']: 66 d[nt] = [0, 0, 0] 67 68 for i in range(0, len(seq), 3): 69 codon = seq[i:i+3] 70 if len(codon) < 3: 71 codon += ' ' 72 for pos in range(0, 3): 73 for nt in ['A', 'T', 'G', 'C']: 74 if codon[pos] == nt or codon[pos] == nt.lower(): 75 d[nt][pos] += 1 76 gc = {} 77 gcall = 0 78 nall = 0 79 for i in range(0, 3): 80 try: 81 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 82 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 83 except: 84 gc[i] = 0 85 86 gcall = gcall + d['G'][i] + d['C'][i] 87 nall = nall + n 88 89 gcall = 100.0*gcall/nall 90 return gcall, gc[0], gc[1], gc[2]
91 92
93 -def GC_skew(seq, window=100):
94 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 95 96 Returns a list of ratios (floats), controlled by the length of the sequence 97 and the size of the window. 98 99 Does NOT look at any ambiguous nucleotides. 100 """ 101 # 8/19/03: Iddo: added lowercase 102 values = [] 103 for i in range(0, len(seq), window): 104 s = seq[i: i + window] 105 g = s.count('G') + s.count('g') 106 c = s.count('C') + s.count('c') 107 skew = (g-c)/float(g+c) 108 values.append(skew) 109 return values
110 111
112 -def xGC_skew(seq, window=1000, zoom=100, 113 r=300, px=100, py=100):
114 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 115 try: 116 import Tkinter as tkinter # Python 2 117 except ImportError: 118 import tkinter # Python 3 119 120 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 121 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 122 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 123 xscrollcommand=xscroll.set, background='white') 124 win = canvas.winfo_toplevel() 125 win.geometry('700x700') 126 127 yscroll.config(command=canvas.yview) 128 xscroll.config(command=canvas.xview) 129 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 130 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 131 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 132 canvas.update() 133 134 X0, Y0 = r + px, r + py 135 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 136 137 ty = Y0 138 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 139 ty += 20 140 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 141 ty += 20 142 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 143 ty += 20 144 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 145 ty += 20 146 canvas.create_oval(x1, y1, x2, y2) 147 148 acc = 0 149 start = 0 150 for gc in GC_skew(seq, window): 151 r1 = r 152 acc += gc 153 # GC skew 154 alpha = pi - (2*pi*start)/len(seq) 155 r2 = r1 - gc*zoom 156 x1 = X0 + r1 * sin(alpha) 157 y1 = Y0 + r1 * cos(alpha) 158 x2 = X0 + r2 * sin(alpha) 159 y2 = Y0 + r2 * cos(alpha) 160 canvas.create_line(x1, y1, x2, y2, fill='blue') 161 # accumulated GC skew 162 r1 = r - 50 163 r2 = r1 - acc 164 x1 = X0 + r1 * sin(alpha) 165 y1 = Y0 + r1 * cos(alpha) 166 x2 = X0 + r2 * sin(alpha) 167 y2 = Y0 + r2 * cos(alpha) 168 canvas.create_line(x1, y1, x2, y2, fill='magenta') 169 170 canvas.update() 171 start += window 172 173 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
174 175
176 -def nt_search(seq, subseq):
177 """Search for a DNA subseq in sequence. 178 179 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 180 searches only on forward strand 181 """ 182 pattern = '' 183 for nt in subseq: 184 value = IUPACData.ambiguous_dna_values[nt] 185 if len(value) == 1: 186 pattern += value 187 else: 188 pattern += '[%s]' % value 189 190 pos = -1 191 result = [pattern] 192 l = len(seq) 193 while True: 194 pos += 1 195 s = seq[pos:] 196 m = re.search(pattern, s) 197 if not m: 198 break 199 pos += int(m.start(0)) 200 result.append(pos) 201 return result
202 203 # }}} 204 205 ###################################### 206 # Protein 207 ###################### 208 # {{{ 209
210 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
211 """Turn a one letter code protein sequence into one with three letter codes. 212 213 The single input argument 'seq' should be a protein sequence using single 214 letter codes, either as a python string or as a Seq or MutableSeq object. 215 216 This function returns the amino acid sequence as a string using the three 217 letter amino acid codes. Output follows the IUPAC standard (including 218 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 219 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 220 Any unknown character (including possible gap characters), is changed into 221 'Xaa'. 222 223 e.g. 224 225 >>> from Bio.SeqUtils import seq3 226 >>> seq3("MAIVMGRWKGAR*") 227 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 228 229 You can set a custom translation of the codon termination code using the 230 "custom_map" argument, e.g. 231 232 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 233 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 234 235 You can also set a custom translation for non-amino acid characters, such 236 as '-', using the "undef_code" argument, e.g. 237 238 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 239 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 240 241 If not given, "undef_code" defaults to "Xaa", e.g. 242 243 >>> seq3("MAIVMGRWKGA--R*") 244 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 245 246 This function was inspired by BioPerl's seq3. 247 """ 248 # not doing .update() on IUPACData dict with custom_map dict 249 # to preserve its initial state (may be imported in other modules) 250 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 251 list(custom_map.items())) 252 #We use a default of 'Xaa' for undefined letters 253 #Note this will map '-' to 'Xaa' which may be undesirable! 254 return ''.join(threecode.get(aa, undef_code) for aa in seq)
255 256
257 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
258 """Turns a three-letter code protein sequence into one with single letter codes. 259 260 The single input argument 'seq' should be a protein sequence using three- 261 letter codes, either as a python string or as a Seq or MutableSeq object. 262 263 This function returns the amino acid sequence as a string using the one 264 letter amino acid codes. Output follows the IUPAC standard (including 265 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 266 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 267 Any unknown character (including possible gap characters), is changed into 268 '-'. 269 270 e.g. 271 272 >>> from Bio.SeqUtils import seq3 273 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 274 'MAIVMGRWKGAR*' 275 276 The input is case insensitive, e.g. 277 278 >>> from Bio.SeqUtils import seq3 279 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 280 'MAIVMGRWKGAR*' 281 282 You can set a custom translation of the codon termination code using the 283 "custom_map" argument, e.g. 284 285 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 286 'MAIVMGRWKGAR*' 287 288 You can also set a custom translation for non-amino acid characters, such 289 as '-', using the "undef_code" argument, e.g. 290 291 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 292 'MAIVMGRWKGA??R*' 293 294 If not given, "undef_code" defaults to "X", e.g. 295 296 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 297 'MAIVMGRWKGAXXR*' 298 299 """ 300 # reverse map of threecode 301 # upper() on all keys to enable caps-insensitive input seq handling 302 onecode = dict((k.upper(), v) for k, v in 303 IUPACData.protein_letters_3to1_extended.items()) 304 # add the given termination codon code and custom maps 305 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 306 seqlist = [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)] 307 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
308 309 310 # }}} 311 312 ###################################### 313 # Mixed ??? 314 ###################### 315 # {{{ 316 317
318 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 319 monoisotopic=False):
320 """Calculates the molecular weight of a DNA, RNA or protein sequence. 321 322 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 323 have a 5' phosphate. 324 325 seq: String or Biopython sequence object. 326 seq_type: The default (None) is to take the alphabet from the seq argument, 327 or assume DNA if the seq argument is a string. Override this with 328 a string 'DNA', 'RNA', or 'protein'. 329 double_stranded: Calculate the mass for the double stranded molecule? 330 circular: Is the molecule circular (has no ends)? 331 monoisotopic: Use the monoisotopic mass tables? 332 333 Note that for backwards compatibility, if the seq argument is a string, 334 or Seq object with a generic alphabet, and no seq_type is specified 335 (i.e. left as None), then DNA is assumed. 336 337 >>> print("%0.2f" % molecular_weight("AGC")) 338 949.61 339 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 340 949.61 341 342 However, it is better to be explicit - for example with strings: 343 344 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 345 949.61 346 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 347 997.61 348 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 349 249.29 350 351 Or, with the sequence alphabet: 352 353 >>> from Bio.Seq import Seq 354 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 355 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 356 949.61 357 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 358 997.61 359 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 360 249.29 361 362 Also note that contradictory sequence alphabets and seq_type will also 363 give an exception: 364 365 >>> from Bio.Seq import Seq 366 >>> from Bio.Alphabet import generic_dna 367 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 368 Traceback (most recent call last): 369 ... 370 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 371 372 """ 373 # Rewritten by Markus Piotrowski, 2014 374 375 # Find the alphabet type 376 tmp_type = '' 377 if isinstance(seq, Seq) or isinstance(seq, MutableSeq): 378 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 379 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 380 tmp_type = 'DNA' 381 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 382 tmp_type = 'RNA' 383 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 384 tmp_type = 'protein' 385 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 386 tmp_type = 'protein' 387 # Convert to one-letter sequence. Have to use a string for seq1 388 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 389 elif not isinstance(base_alphabet, Alphabet.Alphabet): 390 raise TypeError("%s is not a valid alphabet for mass calculations" 391 % base_alphabet) 392 else: 393 tmp_type = "DNA" # backward compatibity 394 if seq_type and tmp_type and tmp_type != seq_type: 395 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 396 % (seq_type, tmp_type)) 397 seq_type = tmp_type 398 elif isinstance(seq, str): 399 if seq_type is None: 400 seq_type = "DNA" # backward compatibity 401 else: 402 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 403 404 seq = ''.join(str(seq).split()).upper() # Do the minimum formatting 405 406 if seq_type == 'DNA': 407 if monoisotopic: 408 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 409 else: 410 weight_table = IUPACData.unambiguous_dna_weights 411 elif seq_type == 'RNA': 412 if monoisotopic: 413 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 414 else: 415 weight_table = IUPACData.unambiguous_rna_weights 416 elif seq_type == 'protein': 417 if monoisotopic: 418 weight_table = IUPACData.monoisotopic_protein_weights 419 else: 420 weight_table = IUPACData.protein_weights 421 else: 422 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 423 % seq_type) 424 425 if monoisotopic: 426 water = 18.010565 427 else: 428 water = 18.0153 429 430 try: 431 weight = sum(weight_table[x] for x in seq) - (len(seq)-1) * water 432 if circular: 433 weight -= water 434 except KeyError as e: 435 raise ValueError('%s is not a valid unambiguous letter for %s' 436 %(e, seq_type)) 437 except: 438 raise 439 440 if seq_type in ('DNA', 'RNA') and double_stranded: 441 seq = str(Seq(seq).complement()) 442 weight += sum(weight_table[x] for x in seq) - (len(seq)-1) * water 443 if circular: 444 weight -= water 445 elif seq_type == 'protein' and double_stranded: 446 raise ValueError('double-stranded proteins await their discovery') 447 448 return weight
449 450
451 -def six_frame_translations(seq, genetic_code=1):
452 """Formatted string showing the 6 frame translations and GC content. 453 454 nice looking 6 frame translation with GC content - code from xbbtools 455 similar to DNA Striders six-frame translation 456 457 >>> from Bio.SeqUtils import six_frame_translations 458 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 459 GC_Frame: a:5 t:0 g:8 c:5 460 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 461 <BLANKLINE> 462 <BLANKLINE> 463 1/1 464 G H C N G P L 465 W P L * W A A 466 M A I V M G R * 467 auggccauuguaaugggccgcuga 54 % 468 uaccgguaacauuacccggcgacu 469 A M T I P R Q 470 H G N Y H A A S 471 P W Q L P G S 472 <BLANKLINE> 473 <BLANKLINE> 474 475 """ 476 from Bio.Seq import reverse_complement, translate 477 anti = reverse_complement(seq) 478 comp = anti[::-1] 479 length = len(seq) 480 frames = {} 481 for i in range(0, 3): 482 fragment_length = 3 * ((length-i) // 3) 483 frames[i+1] = translate(seq[i:i+fragment_length], genetic_code) 484 frames[-(i+1)] = translate(anti[i:i+fragment_length], genetic_code)[::-1] 485 486 # create header 487 if length > 20: 488 short = '%s ... %s' % (seq[:10], seq[-10:]) 489 else: 490 short = seq 491 header = 'GC_Frame: ' 492 for nt in ['a', 't', 'g', 'c']: 493 header += '%s:%d ' % (nt, seq.count(nt.upper())) 494 495 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 496 res = header 497 498 for i in range(0, length, 60): 499 subseq = seq[i:i+60] 500 csubseq = comp[i:i+60] 501 p = i//3 502 res += '%d/%d\n' % (i+1, i/3+1) 503 res += ' ' + ' '.join(frames[3][p:p+20]) + '\n' 504 res += ' ' + ' '.join(frames[2][p:p+20]) + '\n' 505 res += ' '.join(frames[1][p:p+20]) + '\n' 506 # seq 507 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 508 res += csubseq.lower() + '\n' 509 # - frames 510 res += ' '.join(frames[-2][p:p+20]) +' \n' 511 res += ' ' + ' '.join(frames[-1][p:p+20]) + '\n' 512 res += ' ' + ' '.join(frames[-3][p:p+20]) + '\n\n' 513 return res
514 515 # }}} 516 517 ###################################### 518 # FASTA file utilities 519 ###################### 520 # {{{ 521 522
523 -def quick_FASTA_reader(file):
524 """Simple FASTA reader, returning a list of string tuples (DEPRECATED). 525 526 The single argument 'file' should be the filename of a FASTA format file. 527 This function will open and read in the entire file, constructing a list 528 of all the records, each held as a tuple of strings (the sequence name or 529 title, and its sequence). 530 531 >>> seqs = quick_FASTA_reader("Fasta/dups.fasta") 532 >>> for title, sequence in seqs: 533 ... print("%s %s" % (title, sequence)) 534 alpha ACGTA 535 beta CGTC 536 gamma CCGCC 537 alpha (again - this is a duplicate entry to test the indexing code) ACGTA 538 delta CGCGC 539 540 This function was is fast, but because it returns the data as a single in 541 memory list, is unsuitable for large files where an iterator approach is 542 preferable. 543 544 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 545 allows you to iterate over the records one by one (avoiding having all the 546 records in memory at once). Using Bio.SeqIO also makes it easy to switch 547 between different input file formats. However, please note that rather 548 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 549 550 If you want to use simple strings, use the function SimpleFastaParser 551 added to Bio.SeqIO.FastaIO in Biopython 1.61 instead. 552 """ 553 import warnings 554 from Bio import BiopythonDeprecationWarning 555 warnings.warn("The quick_FASTA_reader has been deprecated and will be " 556 "removed in a future release of Biopython. Please try " 557 "function SimpleFastaParser from Bio.SeqIO.FastaIO " 558 "instead.", BiopythonDeprecationWarning) 559 from Bio.SeqIO.FastaIO import SimpleFastaParser 560 with open(file) as handle: 561 entries = list(SimpleFastaParser(handle)) 562 return entries
563 564 565 # }}} 566 567
568 -def _test():
569 """Run the module's doctests (PRIVATE).""" 570 import os 571 import doctest 572 if os.path.isdir(os.path.join("..", "Tests")): 573 print("Running doctests...") 574 cur_dir = os.path.abspath(os.curdir) 575 os.chdir(os.path.join("..", "Tests")) 576 doctest.testmod() 577 os.chdir(cur_dir) 578 del cur_dir 579 print("Done") 580 elif os.path.isdir(os.path.join("Tests")): 581 print("Running doctests...") 582 cur_dir = os.path.abspath(os.curdir) 583 os.chdir(os.path.join("Tests")) 584 doctest.testmod() 585 os.chdir(cur_dir) 586 del cur_dir 587 print("Done")
588 589 590 if __name__ == "__main__": 591 _test() 592