Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # Revisions copyright 2014 by Markus Piotrowski. 
  6  # Revisions copyright 2014-2016 by Peter Cock. 
  7  # All rights reserved. 
  8  # This code is part of the Biopython distribution and governed by its 
  9  # license.  Please see the LICENSE file that should have been included 
 10  # as part of this package. 
 11   
 12  """Miscellaneous functions for dealing with sequences.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  import re 
 17  from math import pi, sin, cos 
 18   
 19  from Bio.Seq import Seq, MutableSeq 
 20  from Bio import Alphabet 
 21  from Bio.Data import IUPACData 
 22   
 23   
 24  ###################################### 
 25  # DNA 
 26  ###################### 
 27  # {{{ 
 28   
 29   
30 -def GC(seq):
31 """Calculates G+C content, returns the percentage (float between 0 and 100). 32 33 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 34 when counting the G and C content. The percentage is calculated against 35 the full length, e.g.: 36 37 >>> from Bio.SeqUtils import GC 38 >>> GC("ACTGN") 39 40.0 40 41 Note that this will return zero for an empty sequence. 42 """ 43 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 44 try: 45 return gc * 100.0 / len(seq) 46 except ZeroDivisionError: 47 return 0.0
48 49
50 -def GC123(seq):
51 """Calculates total G+C content plus first, second and third positions. 52 53 Returns a tuple of four floats (percentages between 0 and 100) for the 54 entire sequence, and the three codon positions. e.g. 55 56 >>> from Bio.SeqUtils import GC123 57 >>> GC123("ACTGTN") 58 (40.0, 50.0, 50.0, 0.0) 59 60 Copes with mixed case sequences, but does NOT deal with ambiguous 61 nucleotides. 62 """ 63 d = {} 64 for nt in ['A', 'T', 'G', 'C']: 65 d[nt] = [0, 0, 0] 66 67 for i in range(0, len(seq), 3): 68 codon = seq[i:i + 3] 69 if len(codon) < 3: 70 codon += ' ' 71 for pos in range(0, 3): 72 for nt in ['A', 'T', 'G', 'C']: 73 if codon[pos] == nt or codon[pos] == nt.lower(): 74 d[nt][pos] += 1 75 gc = {} 76 gcall = 0 77 nall = 0 78 for i in range(0, 3): 79 try: 80 n = d['G'][i] + d['C'][i] + d['T'][i] + d['A'][i] 81 gc[i] = (d['G'][i] + d['C'][i]) * 100.0 / n 82 except Exception: # TODO - ValueError? 83 gc[i] = 0 84 85 gcall = gcall + d['G'][i] + d['C'][i] 86 nall = nall + n 87 88 gcall = 100.0 * gcall / nall 89 return gcall, gc[0], gc[1], gc[2]
90 91
92 -def GC_skew(seq, window=100):
93 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 94 95 Returns a list of ratios (floats), controlled by the length of the sequence 96 and the size of the window. 97 98 Returns 0 for windows without any G/C by handling zero division errors. 99 100 Does NOT look at any ambiguous nucleotides. 101 """ 102 # 8/19/03: Iddo: added lowercase 103 values = [] 104 for i in range(0, len(seq), window): 105 s = seq[i: i + window] 106 g = s.count('G') + s.count('g') 107 c = s.count('C') + s.count('c') 108 try: 109 skew = (g - c) / float(g + c) 110 except ZeroDivisionError: 111 skew = 0.0 112 values.append(skew) 113 return values
114 115
116 -def xGC_skew(seq, window=1000, zoom=100, 117 r=300, px=100, py=100):
118 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 119 try: 120 import Tkinter as tkinter # Python 2 121 except ImportError: 122 import tkinter # Python 3 123 124 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 125 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 126 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 127 xscrollcommand=xscroll.set, background='white') 128 win = canvas.winfo_toplevel() 129 win.geometry('700x700') 130 131 yscroll.config(command=canvas.yview) 132 xscroll.config(command=canvas.xview) 133 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 134 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 135 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 136 canvas.update() 137 138 X0, Y0 = r + px, r + py 139 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 140 141 ty = Y0 142 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 143 ty += 20 144 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 145 ty += 20 146 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 147 ty += 20 148 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 149 ty += 20 150 canvas.create_oval(x1, y1, x2, y2) 151 152 acc = 0 153 start = 0 154 for gc in GC_skew(seq, window): 155 r1 = r 156 acc += gc 157 # GC skew 158 alpha = pi - (2 * pi * start) / len(seq) 159 r2 = r1 - gc * zoom 160 x1 = X0 + r1 * sin(alpha) 161 y1 = Y0 + r1 * cos(alpha) 162 x2 = X0 + r2 * sin(alpha) 163 y2 = Y0 + r2 * cos(alpha) 164 canvas.create_line(x1, y1, x2, y2, fill='blue') 165 # accumulated GC skew 166 r1 = r - 50 167 r2 = r1 - acc 168 x1 = X0 + r1 * sin(alpha) 169 y1 = Y0 + r1 * cos(alpha) 170 x2 = X0 + r2 * sin(alpha) 171 y2 = Y0 + r2 * cos(alpha) 172 canvas.create_line(x1, y1, x2, y2, fill='magenta') 173 174 canvas.update() 175 start += window 176 177 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
178 179
180 -def nt_search(seq, subseq):
181 """Search for a DNA subseq in sequence. 182 183 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 184 searches only on forward strand 185 """ 186 pattern = '' 187 for nt in subseq: 188 value = IUPACData.ambiguous_dna_values[nt] 189 if len(value) == 1: 190 pattern += value 191 else: 192 pattern += '[%s]' % value 193 194 pos = -1 195 result = [pattern] 196 l = len(seq) 197 while True: 198 pos += 1 199 s = seq[pos:] 200 m = re.search(pattern, s) 201 if not m: 202 break 203 pos += int(m.start(0)) 204 result.append(pos) 205 return result
206 207 # }}} 208 209 ###################################### 210 # Protein 211 ###################### 212 # {{{ 213 214
215 -def seq3(seq, custom_map=None, undef_code='Xaa'):
216 """Turn a one letter code protein sequence into one with three letter codes. 217 218 The single required input argument 'seq' should be a protein sequence using 219 single letter codes, either as a python string or as a Seq or MutableSeq 220 object. 221 222 This function returns the amino acid sequence as a string using the three 223 letter amino acid codes. Output follows the IUPAC standard (including 224 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 225 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 226 Any unknown character (including possible gap characters), is changed into 227 'Xaa' by default. 228 229 e.g. 230 231 >>> from Bio.SeqUtils import seq3 232 >>> seq3("MAIVMGRWKGAR*") 233 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 234 235 You can set a custom translation of the codon termination code using the 236 dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g. 237 238 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 239 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 240 241 You can also set a custom translation for non-amino acid characters, such 242 as '-', using the "undef_code" argument, e.g. 243 244 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 245 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 246 247 If not given, "undef_code" defaults to "Xaa", e.g. 248 249 >>> seq3("MAIVMGRWKGA--R*") 250 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 251 252 This function was inspired by BioPerl's seq3. 253 """ 254 if custom_map is None: 255 custom_map = {'*': 'Ter'} 256 # not doing .update() on IUPACData dict with custom_map dict 257 # to preserve its initial state (may be imported in other modules) 258 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 259 list(custom_map.items())) 260 # We use a default of 'Xaa' for undefined letters 261 # Note this will map '-' to 'Xaa' which may be undesirable! 262 return ''.join(threecode.get(aa, undef_code) for aa in seq)
263 264
265 -def seq1(seq, custom_map=None, undef_code='X'):
266 """Turns a three-letter code protein sequence into one with single letter codes. 267 268 The single required input argument 'seq' should be a protein sequence 269 using three-letter codes, either as a python string or as a Seq or 270 MutableSeq object. 271 272 This function returns the amino acid sequence as a string using the one 273 letter amino acid codes. Output follows the IUPAC standard (including 274 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 275 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 276 Any unknown character (including possible gap characters), is changed 277 into '-' by default. 278 279 e.g. 280 281 >>> from Bio.SeqUtils import seq3 282 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 283 'MAIVMGRWKGAR*' 284 285 The input is case insensitive, e.g. 286 287 >>> from Bio.SeqUtils import seq3 288 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 289 'MAIVMGRWKGAR*' 290 291 You can set a custom translation of the codon termination code using the 292 dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g. 293 294 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 295 'MAIVMGRWKGAR*' 296 297 You can also set a custom translation for non-amino acid characters, such 298 as '-', using the "undef_code" argument, e.g. 299 300 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 301 'MAIVMGRWKGA??R*' 302 303 If not given, "undef_code" defaults to "X", e.g. 304 305 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 306 'MAIVMGRWKGAXXR*' 307 308 """ 309 if custom_map is None: 310 custom_map = {'Ter': '*'} 311 # reverse map of threecode 312 # upper() on all keys to enable caps-insensitive input seq handling 313 onecode = dict((k.upper(), v) for k, v in 314 IUPACData.protein_letters_3to1_extended.items()) 315 # add the given termination codon code and custom maps 316 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 317 seqlist = [seq[3 * i:3 * (i + 1)] for i in range(len(seq) // 3)] 318 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
319 320 321 # }}} 322 323 ###################################### 324 # Mixed ??? 325 ###################### 326 # {{{ 327 328
329 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 330 monoisotopic=False):
331 """Calculates the molecular weight of a DNA, RNA or protein sequence. 332 333 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 334 have a 5' phosphate. 335 336 Arguments: 337 - seq: String or Biopython sequence object. 338 - seq_type: The default (None) is to take the alphabet from the seq argument, 339 or assume DNA if the seq argument is a string. Override this with 340 a string 'DNA', 'RNA', or 'protein'. 341 - double_stranded: Calculate the mass for the double stranded molecule? 342 - circular: Is the molecule circular (has no ends)? 343 - monoisotopic: Use the monoisotopic mass tables? 344 345 Note that for backwards compatibility, if the seq argument is a string, 346 or Seq object with a generic alphabet, and no seq_type is specified 347 (i.e. left as None), then DNA is assumed. 348 349 >>> print("%0.2f" % molecular_weight("AGC")) 350 949.61 351 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 352 949.61 353 354 However, it is better to be explicit - for example with strings: 355 356 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 357 949.61 358 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 359 997.61 360 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 361 249.29 362 363 Or, with the sequence alphabet: 364 365 >>> from Bio.Seq import Seq 366 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 367 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 368 949.61 369 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 370 997.61 371 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 372 249.29 373 374 Also note that contradictory sequence alphabets and seq_type will also 375 give an exception: 376 377 >>> from Bio.Seq import Seq 378 >>> from Bio.Alphabet import generic_dna 379 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 380 Traceback (most recent call last): 381 ... 382 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 383 384 """ 385 # Rewritten by Markus Piotrowski, 2014 386 387 # Find the alphabet type 388 tmp_type = '' 389 if isinstance(seq, (Seq, MutableSeq)): 390 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 391 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 392 tmp_type = 'DNA' 393 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 394 tmp_type = 'RNA' 395 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 396 tmp_type = 'protein' 397 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 398 tmp_type = 'protein' 399 # Convert to one-letter sequence. Have to use a string for seq1 400 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 401 elif not isinstance(base_alphabet, Alphabet.Alphabet): 402 raise TypeError("%s is not a valid alphabet for mass calculations" 403 % base_alphabet) 404 else: 405 tmp_type = "DNA" # backward compatibity 406 if seq_type and tmp_type and tmp_type != seq_type: 407 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 408 % (seq_type, tmp_type)) 409 seq_type = tmp_type 410 elif isinstance(seq, str): 411 if seq_type is None: 412 seq_type = "DNA" # backward compatibity 413 else: 414 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 415 416 seq = ''.join(str(seq).split()).upper() # Do the minimum formatting 417 418 if seq_type == 'DNA': 419 if monoisotopic: 420 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 421 else: 422 weight_table = IUPACData.unambiguous_dna_weights 423 elif seq_type == 'RNA': 424 if monoisotopic: 425 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 426 else: 427 weight_table = IUPACData.unambiguous_rna_weights 428 elif seq_type == 'protein': 429 if monoisotopic: 430 weight_table = IUPACData.monoisotopic_protein_weights 431 else: 432 weight_table = IUPACData.protein_weights 433 else: 434 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 435 % seq_type) 436 437 if monoisotopic: 438 water = 18.010565 439 else: 440 water = 18.0153 441 442 try: 443 weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 444 if circular: 445 weight -= water 446 except KeyError as e: 447 raise ValueError('%s is not a valid unambiguous letter for %s' 448 % (e, seq_type)) 449 450 if seq_type in ('DNA', 'RNA') and double_stranded: 451 seq = str(Seq(seq).complement()) 452 weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 453 if circular: 454 weight -= water 455 elif seq_type == 'protein' and double_stranded: 456 raise ValueError('double-stranded proteins await their discovery') 457 458 return weight
459 460
461 -def six_frame_translations(seq, genetic_code=1):
462 """Formatted string showing the 6 frame translations and GC content. 463 464 nice looking 6 frame translation with GC content - code from xbbtools 465 similar to DNA Striders six-frame translation 466 467 >>> from Bio.SeqUtils import six_frame_translations 468 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 469 GC_Frame: a:5 t:0 g:8 c:5 470 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 471 <BLANKLINE> 472 <BLANKLINE> 473 1/1 474 G H C N G P L 475 W P L * W A A 476 M A I V M G R * 477 auggccauuguaaugggccgcuga 54 % 478 uaccgguaacauuacccggcgacu 479 A M T I P R Q 480 H G N Y H A A S 481 P W Q L P G S 482 <BLANKLINE> 483 <BLANKLINE> 484 485 """ # noqa for pep8 W291 trailing whitespace 486 from Bio.Seq import reverse_complement, translate 487 anti = reverse_complement(seq) 488 comp = anti[::-1] 489 length = len(seq) 490 frames = {} 491 for i in range(0, 3): 492 fragment_length = 3 * ((length - i) // 3) 493 frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code) 494 frames[-(i + 1)] = translate(anti[i:i + fragment_length], genetic_code)[::-1] 495 496 # create header 497 if length > 20: 498 short = '%s ... %s' % (seq[:10], seq[-10:]) 499 else: 500 short = seq 501 header = 'GC_Frame: ' 502 for nt in ['a', 't', 'g', 'c']: 503 header += '%s:%d ' % (nt, seq.count(nt.upper())) 504 505 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 506 res = header 507 508 for i in range(0, length, 60): 509 subseq = seq[i:i + 60] 510 csubseq = comp[i:i + 60] 511 p = i // 3 512 res += '%d/%d\n' % (i + 1, i / 3 + 1) 513 res += ' ' + ' '.join(frames[3][p:p + 20]) + '\n' 514 res += ' ' + ' '.join(frames[2][p:p + 20]) + '\n' 515 res += ' '.join(frames[1][p:p + 20]) + '\n' 516 # seq 517 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 518 res += csubseq.lower() + '\n' 519 # - frames 520 res += ' '.join(frames[-2][p:p + 20]) + ' \n' 521 res += ' ' + ' '.join(frames[-1][p:p + 20]) + '\n' 522 res += ' ' + ' '.join(frames[-3][p:p + 20]) + '\n\n' 523 return res
524 525 # }}} 526 527 528 if __name__ == "__main__": 529 from Bio._utils import run_doctest 530 run_doctest() 531