Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # Revisions copyright 2014 by Markus Piotrowski. 
  6  # Revisions copyright 2014-2016 by Peter Cock. 
  7  # All rights reserved. 
  8  # This code is part of the Biopython distribution and governed by its 
  9  # license.  Please see the LICENSE file that should have been included 
 10  # as part of this package. 
 11   
 12  """Miscellaneous functions for dealing with sequences.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  import re 
 17  from math import pi, sin, cos 
 18   
 19  from Bio.Seq import Seq, MutableSeq 
 20  from Bio import Alphabet 
 21  from Bio.Data import IUPACData 
 22   
 23   
 24  ###################################### 
 25  # DNA 
 26  ###################### 
 27  # {{{ 
 28   
 29   
30 -def GC(seq):
31 """Calculates G+C content, returns the percentage (float between 0 and 100). 32 33 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 34 when counting the G and C content. The percentage is calculated against 35 the full length, e.g.: 36 37 >>> from Bio.SeqUtils import GC 38 >>> GC("ACTGN") 39 40.0 40 41 Note that this will return zero for an empty sequence. 42 """ 43 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 44 try: 45 return gc * 100.0 / len(seq) 46 except ZeroDivisionError: 47 return 0.0
48 49
50 -def GC123(seq):
51 """Calculates total G+C content plus first, second and third positions. 52 53 Returns a tuple of four floats (percentages between 0 and 100) for the 54 entire sequence, and the three codon positions. e.g. 55 56 >>> from Bio.SeqUtils import GC123 57 >>> GC123("ACTGTN") 58 (40.0, 50.0, 50.0, 0.0) 59 60 Copes with mixed case sequences, but does NOT deal with ambiguous 61 nucleotides. 62 """ 63 d = {} 64 for nt in ['A', 'T', 'G', 'C']: 65 d[nt] = [0, 0, 0] 66 67 for i in range(0, len(seq), 3): 68 codon = seq[i:i + 3] 69 if len(codon) < 3: 70 codon += ' ' 71 for pos in range(0, 3): 72 for nt in ['A', 'T', 'G', 'C']: 73 if codon[pos] == nt or codon[pos] == nt.lower(): 74 d[nt][pos] += 1 75 gc = {} 76 gcall = 0 77 nall = 0 78 for i in range(0, 3): 79 try: 80 n = d['G'][i] + d['C'][i] + d['T'][i] + d['A'][i] 81 gc[i] = (d['G'][i] + d['C'][i]) * 100.0 / n 82 except Exception: # TODO - ValueError? 83 gc[i] = 0 84 85 gcall = gcall + d['G'][i] + d['C'][i] 86 nall = nall + n 87 88 gcall = 100.0 * gcall / nall 89 return gcall, gc[0], gc[1], gc[2]
90 91
92 -def GC_skew(seq, window=100):
93 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 94 95 Returns a list of ratios (floats), controlled by the length of the sequence 96 and the size of the window. 97 98 Does NOT look at any ambiguous nucleotides. 99 """ 100 # 8/19/03: Iddo: added lowercase 101 values = [] 102 for i in range(0, len(seq), window): 103 s = seq[i: i + window] 104 g = s.count('G') + s.count('g') 105 c = s.count('C') + s.count('c') 106 skew = (g - c) / float(g + c) 107 values.append(skew) 108 return values
109 110
111 -def xGC_skew(seq, window=1000, zoom=100, 112 r=300, px=100, py=100):
113 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 114 try: 115 import Tkinter as tkinter # Python 2 116 except ImportError: 117 import tkinter # Python 3 118 119 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 120 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 121 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 122 xscrollcommand=xscroll.set, background='white') 123 win = canvas.winfo_toplevel() 124 win.geometry('700x700') 125 126 yscroll.config(command=canvas.yview) 127 xscroll.config(command=canvas.xview) 128 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 129 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 130 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 131 canvas.update() 132 133 X0, Y0 = r + px, r + py 134 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 135 136 ty = Y0 137 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 138 ty += 20 139 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 140 ty += 20 141 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 142 ty += 20 143 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 144 ty += 20 145 canvas.create_oval(x1, y1, x2, y2) 146 147 acc = 0 148 start = 0 149 for gc in GC_skew(seq, window): 150 r1 = r 151 acc += gc 152 # GC skew 153 alpha = pi - (2 * pi * start) / len(seq) 154 r2 = r1 - gc * zoom 155 x1 = X0 + r1 * sin(alpha) 156 y1 = Y0 + r1 * cos(alpha) 157 x2 = X0 + r2 * sin(alpha) 158 y2 = Y0 + r2 * cos(alpha) 159 canvas.create_line(x1, y1, x2, y2, fill='blue') 160 # accumulated GC skew 161 r1 = r - 50 162 r2 = r1 - acc 163 x1 = X0 + r1 * sin(alpha) 164 y1 = Y0 + r1 * cos(alpha) 165 x2 = X0 + r2 * sin(alpha) 166 y2 = Y0 + r2 * cos(alpha) 167 canvas.create_line(x1, y1, x2, y2, fill='magenta') 168 169 canvas.update() 170 start += window 171 172 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
173 174
175 -def nt_search(seq, subseq):
176 """Search for a DNA subseq in sequence. 177 178 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 179 searches only on forward strand 180 """ 181 pattern = '' 182 for nt in subseq: 183 value = IUPACData.ambiguous_dna_values[nt] 184 if len(value) == 1: 185 pattern += value 186 else: 187 pattern += '[%s]' % value 188 189 pos = -1 190 result = [pattern] 191 l = len(seq) 192 while True: 193 pos += 1 194 s = seq[pos:] 195 m = re.search(pattern, s) 196 if not m: 197 break 198 pos += int(m.start(0)) 199 result.append(pos) 200 return result
201 202 # }}} 203 204 ###################################### 205 # Protein 206 ###################### 207 # {{{ 208 209
210 -def seq3(seq, custom_map=None, undef_code='Xaa'):
211 """Turn a one letter code protein sequence into one with three letter codes. 212 213 The single required input argument 'seq' should be a protein sequence using 214 single letter codes, either as a python string or as a Seq or MutableSeq 215 object. 216 217 This function returns the amino acid sequence as a string using the three 218 letter amino acid codes. Output follows the IUPAC standard (including 219 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 220 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 221 Any unknown character (including possible gap characters), is changed into 222 'Xaa' by default. 223 224 e.g. 225 226 >>> from Bio.SeqUtils import seq3 227 >>> seq3("MAIVMGRWKGAR*") 228 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 229 230 You can set a custom translation of the codon termination code using the 231 dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g. 232 233 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 234 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 235 236 You can also set a custom translation for non-amino acid characters, such 237 as '-', using the "undef_code" argument, e.g. 238 239 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 240 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 241 242 If not given, "undef_code" defaults to "Xaa", e.g. 243 244 >>> seq3("MAIVMGRWKGA--R*") 245 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 246 247 This function was inspired by BioPerl's seq3. 248 """ 249 if custom_map is None: 250 custom_map = {'*': 'Ter'} 251 # not doing .update() on IUPACData dict with custom_map dict 252 # to preserve its initial state (may be imported in other modules) 253 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 254 list(custom_map.items())) 255 # We use a default of 'Xaa' for undefined letters 256 # Note this will map '-' to 'Xaa' which may be undesirable! 257 return ''.join(threecode.get(aa, undef_code) for aa in seq)
258 259
260 -def seq1(seq, custom_map=None, undef_code='X'):
261 """Turns a three-letter code protein sequence into one with single letter codes. 262 263 The single required input argument 'seq' should be a protein sequence 264 using three-letter codes, either as a python string or as a Seq or 265 MutableSeq object. 266 267 This function returns the amino acid sequence as a string using the one 268 letter amino acid codes. Output follows the IUPAC standard (including 269 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 270 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 271 Any unknown character (including possible gap characters), is changed 272 into '-' by default. 273 274 e.g. 275 276 >>> from Bio.SeqUtils import seq3 277 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 278 'MAIVMGRWKGAR*' 279 280 The input is case insensitive, e.g. 281 282 >>> from Bio.SeqUtils import seq3 283 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 284 'MAIVMGRWKGAR*' 285 286 You can set a custom translation of the codon termination code using the 287 dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g. 288 289 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 290 'MAIVMGRWKGAR*' 291 292 You can also set a custom translation for non-amino acid characters, such 293 as '-', using the "undef_code" argument, e.g. 294 295 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 296 'MAIVMGRWKGA??R*' 297 298 If not given, "undef_code" defaults to "X", e.g. 299 300 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 301 'MAIVMGRWKGAXXR*' 302 303 """ 304 if custom_map is None: 305 custom_map = {'Ter': '*'} 306 # reverse map of threecode 307 # upper() on all keys to enable caps-insensitive input seq handling 308 onecode = dict((k.upper(), v) for k, v in 309 IUPACData.protein_letters_3to1_extended.items()) 310 # add the given termination codon code and custom maps 311 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 312 seqlist = [seq[3 * i:3 * (i + 1)] for i in range(len(seq) // 3)] 313 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
314 315 316 # }}} 317 318 ###################################### 319 # Mixed ??? 320 ###################### 321 # {{{ 322 323
324 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 325 monoisotopic=False):
326 """Calculates the molecular weight of a DNA, RNA or protein sequence. 327 328 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 329 have a 5' phosphate. 330 331 - seq: String or Biopython sequence object. 332 - seq_type: The default (None) is to take the alphabet from the seq argument, 333 or assume DNA if the seq argument is a string. Override this with 334 a string 'DNA', 'RNA', or 'protein'. 335 - double_stranded: Calculate the mass for the double stranded molecule? 336 - circular: Is the molecule circular (has no ends)? 337 - monoisotopic: Use the monoisotopic mass tables? 338 339 Note that for backwards compatibility, if the seq argument is a string, 340 or Seq object with a generic alphabet, and no seq_type is specified 341 (i.e. left as None), then DNA is assumed. 342 343 >>> print("%0.2f" % molecular_weight("AGC")) 344 949.61 345 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 346 949.61 347 348 However, it is better to be explicit - for example with strings: 349 350 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 351 949.61 352 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 353 997.61 354 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 355 249.29 356 357 Or, with the sequence alphabet: 358 359 >>> from Bio.Seq import Seq 360 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 361 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 362 949.61 363 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 364 997.61 365 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 366 249.29 367 368 Also note that contradictory sequence alphabets and seq_type will also 369 give an exception: 370 371 >>> from Bio.Seq import Seq 372 >>> from Bio.Alphabet import generic_dna 373 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 374 Traceback (most recent call last): 375 ... 376 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 377 378 """ 379 # Rewritten by Markus Piotrowski, 2014 380 381 # Find the alphabet type 382 tmp_type = '' 383 if isinstance(seq, (Seq, MutableSeq)): 384 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 385 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 386 tmp_type = 'DNA' 387 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 388 tmp_type = 'RNA' 389 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 390 tmp_type = 'protein' 391 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 392 tmp_type = 'protein' 393 # Convert to one-letter sequence. Have to use a string for seq1 394 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 395 elif not isinstance(base_alphabet, Alphabet.Alphabet): 396 raise TypeError("%s is not a valid alphabet for mass calculations" 397 % base_alphabet) 398 else: 399 tmp_type = "DNA" # backward compatibity 400 if seq_type and tmp_type and tmp_type != seq_type: 401 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 402 % (seq_type, tmp_type)) 403 seq_type = tmp_type 404 elif isinstance(seq, str): 405 if seq_type is None: 406 seq_type = "DNA" # backward compatibity 407 else: 408 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 409 410 seq = ''.join(str(seq).split()).upper() # Do the minimum formatting 411 412 if seq_type == 'DNA': 413 if monoisotopic: 414 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 415 else: 416 weight_table = IUPACData.unambiguous_dna_weights 417 elif seq_type == 'RNA': 418 if monoisotopic: 419 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 420 else: 421 weight_table = IUPACData.unambiguous_rna_weights 422 elif seq_type == 'protein': 423 if monoisotopic: 424 weight_table = IUPACData.monoisotopic_protein_weights 425 else: 426 weight_table = IUPACData.protein_weights 427 else: 428 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 429 % seq_type) 430 431 if monoisotopic: 432 water = 18.010565 433 else: 434 water = 18.0153 435 436 try: 437 weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 438 if circular: 439 weight -= water 440 except KeyError as e: 441 raise ValueError('%s is not a valid unambiguous letter for %s' 442 % (e, seq_type)) 443 except: 444 raise 445 446 if seq_type in ('DNA', 'RNA') and double_stranded: 447 seq = str(Seq(seq).complement()) 448 weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 449 if circular: 450 weight -= water 451 elif seq_type == 'protein' and double_stranded: 452 raise ValueError('double-stranded proteins await their discovery') 453 454 return weight
455 456
457 -def six_frame_translations(seq, genetic_code=1):
458 """Formatted string showing the 6 frame translations and GC content. 459 460 nice looking 6 frame translation with GC content - code from xbbtools 461 similar to DNA Striders six-frame translation 462 463 >>> from Bio.SeqUtils import six_frame_translations 464 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 465 GC_Frame: a:5 t:0 g:8 c:5 466 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 467 <BLANKLINE> 468 <BLANKLINE> 469 1/1 470 G H C N G P L 471 W P L * W A A 472 M A I V M G R * 473 auggccauuguaaugggccgcuga 54 % 474 uaccgguaacauuacccggcgacu 475 A M T I P R Q 476 H G N Y H A A S 477 P W Q L P G S 478 <BLANKLINE> 479 <BLANKLINE> 480 481 """ # noqa for pep8 W291 trailing whitespace 482 from Bio.Seq import reverse_complement, translate 483 anti = reverse_complement(seq) 484 comp = anti[::-1] 485 length = len(seq) 486 frames = {} 487 for i in range(0, 3): 488 fragment_length = 3 * ((length - i) // 3) 489 frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code) 490 frames[-(i + 1)] = translate(anti[i:i + fragment_length], genetic_code)[::-1] 491 492 # create header 493 if length > 20: 494 short = '%s ... %s' % (seq[:10], seq[-10:]) 495 else: 496 short = seq 497 header = 'GC_Frame: ' 498 for nt in ['a', 't', 'g', 'c']: 499 header += '%s:%d ' % (nt, seq.count(nt.upper())) 500 501 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 502 res = header 503 504 for i in range(0, length, 60): 505 subseq = seq[i:i + 60] 506 csubseq = comp[i:i + 60] 507 p = i // 3 508 res += '%d/%d\n' % (i + 1, i / 3 + 1) 509 res += ' ' + ' '.join(frames[3][p:p + 20]) + '\n' 510 res += ' ' + ' '.join(frames[2][p:p + 20]) + '\n' 511 res += ' '.join(frames[1][p:p + 20]) + '\n' 512 # seq 513 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 514 res += csubseq.lower() + '\n' 515 # - frames 516 res += ' '.join(frames[-2][p:p + 20]) + ' \n' 517 res += ' ' + ' '.join(frames[-1][p:p + 20]) + '\n' 518 res += ' ' + ' '.join(frames[-3][p:p + 20]) + '\n\n' 519 return res
520 521 # }}} 522 523 524 if __name__ == "__main__": 525 from Bio._utils import run_doctest 526 run_doctest() 527