Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # Revisions copyright 2014 by Markus Piotrowski. 
  6  # Revisions copyright 2014-2016 by Peter Cock. 
  7  # All rights reserved. 
  8  # This code is part of the Biopython distribution and governed by its 
  9  # license.  Please see the LICENSE file that should have been included 
 10  # as part of this package. 
 11   
 12  """Miscellaneous functions for dealing with sequences.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  import re 
 17  from math import pi, sin, cos 
 18   
 19  from Bio.Seq import Seq, MutableSeq 
 20  from Bio import Alphabet 
 21  from Bio.Alphabet import IUPAC 
 22  from Bio.Data import IUPACData 
 23   
 24   
 25  ###################################### 
 26  # DNA 
 27  ###################### 
 28  # {{{ 
 29   
 30   
31 -def GC(seq):
32 """Calculates G+C content, returns the percentage (float between 0 and 100). 33 34 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 35 when counting the G and C content. The percentage is calculated against 36 the full length, e.g.: 37 38 >>> from Bio.SeqUtils import GC 39 >>> GC("ACTGN") 40 40.0 41 42 Note that this will return zero for an empty sequence. 43 """ 44 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 45 try: 46 return gc * 100.0 / len(seq) 47 except ZeroDivisionError: 48 return 0.0
49 50
51 -def GC123(seq):
52 """Calculates total G+C content plus first, second and third positions. 53 54 Returns a tuple of four floats (percentages between 0 and 100) for the 55 entire sequence, and the three codon positions. e.g. 56 57 >>> from Bio.SeqUtils import GC123 58 >>> GC123("ACTGTN") 59 (40.0, 50.0, 50.0, 0.0) 60 61 Copes with mixed case sequences, but does NOT deal with ambiguous 62 nucleotides. 63 """ 64 d = {} 65 for nt in ['A', 'T', 'G', 'C']: 66 d[nt] = [0, 0, 0] 67 68 for i in range(0, len(seq), 3): 69 codon = seq[i:i + 3] 70 if len(codon) < 3: 71 codon += ' ' 72 for pos in range(0, 3): 73 for nt in ['A', 'T', 'G', 'C']: 74 if codon[pos] == nt or codon[pos] == nt.lower(): 75 d[nt][pos] += 1 76 gc = {} 77 gcall = 0 78 nall = 0 79 for i in range(0, 3): 80 try: 81 n = d['G'][i] + d['C'][i] + d['T'][i] + d['A'][i] 82 gc[i] = (d['G'][i] + d['C'][i]) * 100.0 / n 83 except Exception: # TODO - ValueError? 84 gc[i] = 0 85 86 gcall = gcall + d['G'][i] + d['C'][i] 87 nall = nall + n 88 89 gcall = 100.0 * gcall / nall 90 return gcall, gc[0], gc[1], gc[2]
91 92
93 -def GC_skew(seq, window=100):
94 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 95 96 Returns a list of ratios (floats), controlled by the length of the sequence 97 and the size of the window. 98 99 Does NOT look at any ambiguous nucleotides. 100 """ 101 # 8/19/03: Iddo: added lowercase 102 values = [] 103 for i in range(0, len(seq), window): 104 s = seq[i: i + window] 105 g = s.count('G') + s.count('g') 106 c = s.count('C') + s.count('c') 107 skew = (g - c) / float(g + c) 108 values.append(skew) 109 return values
110 111
112 -def xGC_skew(seq, window=1000, zoom=100, 113 r=300, px=100, py=100):
114 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 115 try: 116 import Tkinter as tkinter # Python 2 117 except ImportError: 118 import tkinter # Python 3 119 120 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 121 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 122 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 123 xscrollcommand=xscroll.set, background='white') 124 win = canvas.winfo_toplevel() 125 win.geometry('700x700') 126 127 yscroll.config(command=canvas.yview) 128 xscroll.config(command=canvas.xview) 129 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 130 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 131 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 132 canvas.update() 133 134 X0, Y0 = r + px, r + py 135 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 136 137 ty = Y0 138 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 139 ty += 20 140 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 141 ty += 20 142 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 143 ty += 20 144 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 145 ty += 20 146 canvas.create_oval(x1, y1, x2, y2) 147 148 acc = 0 149 start = 0 150 for gc in GC_skew(seq, window): 151 r1 = r 152 acc += gc 153 # GC skew 154 alpha = pi - (2 * pi * start) / len(seq) 155 r2 = r1 - gc * zoom 156 x1 = X0 + r1 * sin(alpha) 157 y1 = Y0 + r1 * cos(alpha) 158 x2 = X0 + r2 * sin(alpha) 159 y2 = Y0 + r2 * cos(alpha) 160 canvas.create_line(x1, y1, x2, y2, fill='blue') 161 # accumulated GC skew 162 r1 = r - 50 163 r2 = r1 - acc 164 x1 = X0 + r1 * sin(alpha) 165 y1 = Y0 + r1 * cos(alpha) 166 x2 = X0 + r2 * sin(alpha) 167 y2 = Y0 + r2 * cos(alpha) 168 canvas.create_line(x1, y1, x2, y2, fill='magenta') 169 170 canvas.update() 171 start += window 172 173 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
174 175
176 -def nt_search(seq, subseq):
177 """Search for a DNA subseq in sequence. 178 179 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 180 searches only on forward strand 181 """ 182 pattern = '' 183 for nt in subseq: 184 value = IUPACData.ambiguous_dna_values[nt] 185 if len(value) == 1: 186 pattern += value 187 else: 188 pattern += '[%s]' % value 189 190 pos = -1 191 result = [pattern] 192 l = len(seq) 193 while True: 194 pos += 1 195 s = seq[pos:] 196 m = re.search(pattern, s) 197 if not m: 198 break 199 pos += int(m.start(0)) 200 result.append(pos) 201 return result
202 203 # }}} 204 205 ###################################### 206 # Protein 207 ###################### 208 # {{{ 209 210
211 -def seq3(seq, custom_map=None, undef_code='Xaa'):
212 """Turn a one letter code protein sequence into one with three letter codes. 213 214 The single required input argument 'seq' should be a protein sequence using 215 single letter codes, either as a python string or as a Seq or MutableSeq 216 object. 217 218 This function returns the amino acid sequence as a string using the three 219 letter amino acid codes. Output follows the IUPAC standard (including 220 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 221 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 222 Any unknown character (including possible gap characters), is changed into 223 'Xaa' by default. 224 225 e.g. 226 227 >>> from Bio.SeqUtils import seq3 228 >>> seq3("MAIVMGRWKGAR*") 229 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 230 231 You can set a custom translation of the codon termination code using the 232 dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g. 233 234 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 235 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 236 237 You can also set a custom translation for non-amino acid characters, such 238 as '-', using the "undef_code" argument, e.g. 239 240 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 241 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 242 243 If not given, "undef_code" defaults to "Xaa", e.g. 244 245 >>> seq3("MAIVMGRWKGA--R*") 246 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 247 248 This function was inspired by BioPerl's seq3. 249 """ 250 if custom_map is None: 251 custom_map = {'*': 'Ter'} 252 # not doing .update() on IUPACData dict with custom_map dict 253 # to preserve its initial state (may be imported in other modules) 254 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 255 list(custom_map.items())) 256 # We use a default of 'Xaa' for undefined letters 257 # Note this will map '-' to 'Xaa' which may be undesirable! 258 return ''.join(threecode.get(aa, undef_code) for aa in seq)
259 260
261 -def seq1(seq, custom_map=None, undef_code='X'):
262 """Turns a three-letter code protein sequence into one with single letter codes. 263 264 The single required input argument 'seq' should be a protein sequence 265 using three-letter codes, either as a python string or as a Seq or 266 MutableSeq object. 267 268 This function returns the amino acid sequence as a string using the one 269 letter amino acid codes. Output follows the IUPAC standard (including 270 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 271 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 272 Any unknown character (including possible gap characters), is changed 273 into '-' by default. 274 275 e.g. 276 277 >>> from Bio.SeqUtils import seq3 278 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 279 'MAIVMGRWKGAR*' 280 281 The input is case insensitive, e.g. 282 283 >>> from Bio.SeqUtils import seq3 284 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 285 'MAIVMGRWKGAR*' 286 287 You can set a custom translation of the codon termination code using the 288 dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g. 289 290 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 291 'MAIVMGRWKGAR*' 292 293 You can also set a custom translation for non-amino acid characters, such 294 as '-', using the "undef_code" argument, e.g. 295 296 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 297 'MAIVMGRWKGA??R*' 298 299 If not given, "undef_code" defaults to "X", e.g. 300 301 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 302 'MAIVMGRWKGAXXR*' 303 304 """ 305 if custom_map is None: 306 custom_map = {'Ter': '*'} 307 # reverse map of threecode 308 # upper() on all keys to enable caps-insensitive input seq handling 309 onecode = dict((k.upper(), v) for k, v in 310 IUPACData.protein_letters_3to1_extended.items()) 311 # add the given termination codon code and custom maps 312 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 313 seqlist = [seq[3 * i:3 * (i + 1)] for i in range(len(seq) // 3)] 314 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
315 316 317 # }}} 318 319 ###################################### 320 # Mixed ??? 321 ###################### 322 # {{{ 323 324
325 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 326 monoisotopic=False):
327 """Calculates the molecular weight of a DNA, RNA or protein sequence. 328 329 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 330 have a 5' phosphate. 331 332 - seq: String or Biopython sequence object. 333 - seq_type: The default (None) is to take the alphabet from the seq argument, 334 or assume DNA if the seq argument is a string. Override this with 335 a string 'DNA', 'RNA', or 'protein'. 336 - double_stranded: Calculate the mass for the double stranded molecule? 337 - circular: Is the molecule circular (has no ends)? 338 - monoisotopic: Use the monoisotopic mass tables? 339 340 Note that for backwards compatibility, if the seq argument is a string, 341 or Seq object with a generic alphabet, and no seq_type is specified 342 (i.e. left as None), then DNA is assumed. 343 344 >>> print("%0.2f" % molecular_weight("AGC")) 345 949.61 346 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 347 949.61 348 349 However, it is better to be explicit - for example with strings: 350 351 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 352 949.61 353 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 354 997.61 355 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 356 249.29 357 358 Or, with the sequence alphabet: 359 360 >>> from Bio.Seq import Seq 361 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 362 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 363 949.61 364 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 365 997.61 366 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 367 249.29 368 369 Also note that contradictory sequence alphabets and seq_type will also 370 give an exception: 371 372 >>> from Bio.Seq import Seq 373 >>> from Bio.Alphabet import generic_dna 374 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 375 Traceback (most recent call last): 376 ... 377 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 378 379 """ 380 # Rewritten by Markus Piotrowski, 2014 381 382 # Find the alphabet type 383 tmp_type = '' 384 if isinstance(seq, (Seq, MutableSeq)): 385 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 386 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 387 tmp_type = 'DNA' 388 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 389 tmp_type = 'RNA' 390 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 391 tmp_type = 'protein' 392 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 393 tmp_type = 'protein' 394 # Convert to one-letter sequence. Have to use a string for seq1 395 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 396 elif not isinstance(base_alphabet, Alphabet.Alphabet): 397 raise TypeError("%s is not a valid alphabet for mass calculations" 398 % base_alphabet) 399 else: 400 tmp_type = "DNA" # backward compatibity 401 if seq_type and tmp_type and tmp_type != seq_type: 402 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 403 % (seq_type, tmp_type)) 404 seq_type = tmp_type 405 elif isinstance(seq, str): 406 if seq_type is None: 407 seq_type = "DNA" # backward compatibity 408 else: 409 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 410 411 seq = ''.join(str(seq).split()).upper() # Do the minimum formatting 412 413 if seq_type == 'DNA': 414 if monoisotopic: 415 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 416 else: 417 weight_table = IUPACData.unambiguous_dna_weights 418 elif seq_type == 'RNA': 419 if monoisotopic: 420 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 421 else: 422 weight_table = IUPACData.unambiguous_rna_weights 423 elif seq_type == 'protein': 424 if monoisotopic: 425 weight_table = IUPACData.monoisotopic_protein_weights 426 else: 427 weight_table = IUPACData.protein_weights 428 else: 429 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 430 % seq_type) 431 432 if monoisotopic: 433 water = 18.010565 434 else: 435 water = 18.0153 436 437 try: 438 weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 439 if circular: 440 weight -= water 441 except KeyError as e: 442 raise ValueError('%s is not a valid unambiguous letter for %s' 443 % (e, seq_type)) 444 except: 445 raise 446 447 if seq_type in ('DNA', 'RNA') and double_stranded: 448 seq = str(Seq(seq).complement()) 449 weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 450 if circular: 451 weight -= water 452 elif seq_type == 'protein' and double_stranded: 453 raise ValueError('double-stranded proteins await their discovery') 454 455 return weight
456 457
458 -def six_frame_translations(seq, genetic_code=1):
459 """Formatted string showing the 6 frame translations and GC content. 460 461 nice looking 6 frame translation with GC content - code from xbbtools 462 similar to DNA Striders six-frame translation 463 464 >>> from Bio.SeqUtils import six_frame_translations 465 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 466 GC_Frame: a:5 t:0 g:8 c:5 467 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 468 <BLANKLINE> 469 <BLANKLINE> 470 1/1 471 G H C N G P L 472 W P L * W A A 473 M A I V M G R * 474 auggccauuguaaugggccgcuga 54 % 475 uaccgguaacauuacccggcgacu 476 A M T I P R Q 477 H G N Y H A A S 478 P W Q L P G S 479 <BLANKLINE> 480 <BLANKLINE> 481 482 """ # noqa for pep8 W291 trailing whitespace 483 from Bio.Seq import reverse_complement, translate 484 anti = reverse_complement(seq) 485 comp = anti[::-1] 486 length = len(seq) 487 frames = {} 488 for i in range(0, 3): 489 fragment_length = 3 * ((length - i) // 3) 490 frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code) 491 frames[-(i + 1)] = translate(anti[i:i + fragment_length], genetic_code)[::-1] 492 493 # create header 494 if length > 20: 495 short = '%s ... %s' % (seq[:10], seq[-10:]) 496 else: 497 short = seq 498 header = 'GC_Frame: ' 499 for nt in ['a', 't', 'g', 'c']: 500 header += '%s:%d ' % (nt, seq.count(nt.upper())) 501 502 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 503 res = header 504 505 for i in range(0, length, 60): 506 subseq = seq[i:i + 60] 507 csubseq = comp[i:i + 60] 508 p = i // 3 509 res += '%d/%d\n' % (i + 1, i / 3 + 1) 510 res += ' ' + ' '.join(frames[3][p:p + 20]) + '\n' 511 res += ' ' + ' '.join(frames[2][p:p + 20]) + '\n' 512 res += ' '.join(frames[1][p:p + 20]) + '\n' 513 # seq 514 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 515 res += csubseq.lower() + '\n' 516 # - frames 517 res += ' '.join(frames[-2][p:p + 20]) + ' \n' 518 res += ' ' + ' '.join(frames[-1][p:p + 20]) + '\n' 519 res += ' ' + ' '.join(frames[-3][p:p + 20]) + '\n\n' 520 return res
521 522 # }}} 523 524
525 -def _test():
526 """Run the module's doctests (PRIVATE).""" 527 import os 528 import doctest 529 if os.path.isdir(os.path.join("..", "Tests")): 530 print("Running doctests...") 531 cur_dir = os.path.abspath(os.curdir) 532 os.chdir(os.path.join("..", "Tests")) 533 doctest.testmod() 534 os.chdir(cur_dir) 535 del cur_dir 536 print("Done") 537 elif os.path.isdir(os.path.join("Tests")): 538 print("Running doctests...") 539 cur_dir = os.path.abspath(os.curdir) 540 os.chdir(os.path.join("Tests")) 541 doctest.testmod() 542 os.chdir(cur_dir) 543 del cur_dir 544 print("Done")
545 546 547 if __name__ == "__main__": 548 _test() 549