Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Copyright 2002 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  3  # Revisions copyright 2014 by Markus Piotrowski. 
  4  # Revisions copyright 2014-2016 by Peter Cock. 
  5  # All rights reserved. 
  6  # This file is part of the Biopython distribution and governed by your 
  7  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  8  # Please see the LICENSE file that should have been included as part of this 
  9  # package. 
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  from __future__ import print_function 
 13   
 14  import re 
 15  from math import pi, sin, cos 
 16   
 17  from Bio.Seq import Seq, MutableSeq 
 18  from Bio import Alphabet 
 19  from Bio.Data import IUPACData 
 20   
 21   
 22  ###################################### 
 23  # DNA 
 24  ###################### 
 25  # {{{ 
 26   
 27   
28 -def GC(seq):
29 """Calculate G+C content, return percentage (as float between 0 and 100). 30 31 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 32 when counting the G and C content. The percentage is calculated against 33 the full length, e.g.: 34 35 >>> from Bio.SeqUtils import GC 36 >>> GC("ACTGN") 37 40.0 38 39 Note that this will return zero for an empty sequence. 40 """ 41 gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"]) 42 try: 43 return gc * 100.0 / len(seq) 44 except ZeroDivisionError: 45 return 0.0
46 47
48 -def GC123(seq):
49 """Calculate G+C content: total, for first, second and third positions. 50 51 Returns a tuple of four floats (percentages between 0 and 100) for the 52 entire sequence, and the three codon positions. e.g. 53 54 >>> from Bio.SeqUtils import GC123 55 >>> GC123("ACTGTN") 56 (40.0, 50.0, 50.0, 0.0) 57 58 Copes with mixed case sequences, but does NOT deal with ambiguous 59 nucleotides. 60 """ 61 d = {} 62 for nt in ["A", "T", "G", "C"]: 63 d[nt] = [0, 0, 0] 64 65 for i in range(0, len(seq), 3): 66 codon = seq[i:i + 3] 67 if len(codon) < 3: 68 codon += " " 69 for pos in range(0, 3): 70 for nt in ["A", "T", "G", "C"]: 71 if codon[pos] == nt or codon[pos] == nt.lower(): 72 d[nt][pos] += 1 73 gc = {} 74 gcall = 0 75 nall = 0 76 for i in range(0, 3): 77 try: 78 n = d["G"][i] + d["C"][i] + d["T"][i] + d["A"][i] 79 gc[i] = (d["G"][i] + d["C"][i]) * 100.0 / n 80 except Exception: # TODO - ValueError? 81 gc[i] = 0 82 83 gcall = gcall + d["G"][i] + d["C"][i] 84 nall = nall + n 85 86 gcall = 100.0 * gcall / nall 87 return gcall, gc[0], gc[1], gc[2]
88 89
90 -def GC_skew(seq, window=100):
91 """Calculate GC skew (G-C)/(G+C) for multiple windows along the sequence. 92 93 Returns a list of ratios (floats), controlled by the length of the sequence 94 and the size of the window. 95 96 Returns 0 for windows without any G/C by handling zero division errors. 97 98 Does NOT look at any ambiguous nucleotides. 99 """ 100 # 8/19/03: Iddo: added lowercase 101 values = [] 102 for i in range(0, len(seq), window): 103 s = seq[i: i + window] 104 g = s.count("G") + s.count("g") 105 c = s.count("C") + s.count("c") 106 try: 107 skew = (g - c) / float(g + c) 108 except ZeroDivisionError: 109 skew = 0.0 110 values.append(skew) 111 return values
112 113
114 -def xGC_skew(seq, window=1000, zoom=100, r=300, px=100, py=100):
115 """Calculate and plot normal and accumulated GC skew (GRAPHICS !!!).""" 116 try: 117 import Tkinter as tkinter # Python 2 118 except ImportError: 119 import tkinter # Python 3 120 121 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 122 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 123 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 124 xscrollcommand=xscroll.set, background="white") 125 win = canvas.winfo_toplevel() 126 win.geometry("700x700") 127 128 yscroll.config(command=canvas.yview) 129 xscroll.config(command=canvas.xview) 130 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 131 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 132 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 133 canvas.update() 134 135 X0, Y0 = r + px, r + py 136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 137 138 ty = Y0 139 canvas.create_text(X0, ty, text="%s...%s (%d nt)" % (seq[:7], 140 seq[-7:], len(seq))) 141 ty += 20 142 canvas.create_text(X0, ty, text="GC %3.2f%%" % (GC(seq))) 143 ty += 20 144 canvas.create_text(X0, ty, text="GC Skew", fill="blue") 145 ty += 20 146 canvas.create_text(X0, ty, text="Accumulated GC Skew", fill="magenta") 147 ty += 20 148 canvas.create_oval(x1, y1, x2, y2) 149 150 acc = 0 151 start = 0 152 for gc in GC_skew(seq, window): 153 r1 = r 154 acc += gc 155 # GC skew 156 alpha = pi - (2 * pi * start) / len(seq) 157 r2 = r1 - gc * zoom 158 x1 = X0 + r1 * sin(alpha) 159 y1 = Y0 + r1 * cos(alpha) 160 x2 = X0 + r2 * sin(alpha) 161 y2 = Y0 + r2 * cos(alpha) 162 canvas.create_line(x1, y1, x2, y2, fill="blue") 163 # accumulated GC skew 164 r1 = r - 50 165 r2 = r1 - acc 166 x1 = X0 + r1 * sin(alpha) 167 y1 = Y0 + r1 * cos(alpha) 168 x2 = X0 + r2 * sin(alpha) 169 y2 = Y0 + r2 * cos(alpha) 170 canvas.create_line(x1, y1, x2, y2, fill="magenta") 171 172 canvas.update() 173 start += window 174 175 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
176 177
178 -def nt_search(seq, subseq):
179 """Search for a DNA subseq in sequence, return list of [subseq, positions]. 180 181 Use ambiguous values (like N = A or T or C or G, R = A or G etc.), 182 searches only on forward strand. 183 """ 184 pattern = "" 185 for nt in subseq: 186 value = IUPACData.ambiguous_dna_values[nt] 187 if len(value) == 1: 188 pattern += value 189 else: 190 pattern += "[%s]" % value 191 192 pos = -1 193 result = [pattern] 194 while True: 195 pos += 1 196 s = seq[pos:] 197 m = re.search(pattern, s) 198 if not m: 199 break 200 pos += int(m.start(0)) 201 result.append(pos) 202 return result
203 204 205 ###################################### 206 # Protein 207 ###################### 208 209
210 -def seq3(seq, custom_map=None, undef_code="Xaa"):
211 """Convert protein sequence from one-letter to three-letter code. 212 213 The single required input argument 'seq' should be a protein sequence using 214 single letter codes, either as a Python string or as a Seq or MutableSeq 215 object. 216 217 This function returns the amino acid sequence as a string using the three 218 letter amino acid codes. Output follows the IUPAC standard (including 219 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 220 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an 221 asterisk. Any unknown character (including possible gap characters), 222 is changed into 'Xaa' by default. 223 224 e.g. 225 226 >>> from Bio.SeqUtils import seq3 227 >>> seq3("MAIVMGRWKGAR*") 228 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 229 230 You can set a custom translation of the codon termination code using the 231 dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g. 232 233 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 234 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 235 236 You can also set a custom translation for non-amino acid characters, such 237 as '-', using the "undef_code" argument, e.g. 238 239 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 240 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 241 242 If not given, "undef_code" defaults to "Xaa", e.g. 243 244 >>> seq3("MAIVMGRWKGA--R*") 245 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 246 247 This function was inspired by BioPerl's seq3. 248 """ 249 if custom_map is None: 250 custom_map = {"*": "Ter"} 251 # not doing .update() on IUPACData dict with custom_map dict 252 # to preserve its initial state (may be imported in other modules) 253 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 254 list(custom_map.items())) 255 # We use a default of 'Xaa' for undefined letters 256 # Note this will map '-' to 'Xaa' which may be undesirable! 257 return "".join(threecode.get(aa, undef_code) for aa in seq)
258 259
260 -def seq1(seq, custom_map=None, undef_code="X"):
261 """Convert protein sequence from three-letter to one-letter code. 262 263 The single required input argument 'seq' should be a protein sequence 264 using three-letter codes, either as a Python string or as a Seq or 265 MutableSeq object. 266 267 This function returns the amino acid sequence as a string using the one 268 letter amino acid codes. Output follows the IUPAC standard (including 269 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 270 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 271 Any unknown character (including possible gap characters), is changed 272 into '-' by default. 273 274 e.g. 275 276 >>> from Bio.SeqUtils import seq1 277 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 278 'MAIVMGRWKGAR*' 279 280 The input is case insensitive, e.g. 281 282 >>> from Bio.SeqUtils import seq1 283 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 284 'MAIVMGRWKGAR*' 285 286 You can set a custom translation of the codon termination code using the 287 dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g. 288 289 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla***", custom_map={"***": "*"}) 290 'MAIVMGRWKGA*' 291 292 You can also set a custom translation for non-amino acid characters, such 293 as '-', using the "undef_code" argument, e.g. 294 295 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 296 'MAIVMGRWKGA??R*' 297 298 If not given, "undef_code" defaults to "X", e.g. 299 300 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 301 'MAIVMGRWKGAXXR*' 302 303 """ 304 if custom_map is None: 305 custom_map = {"Ter": "*"} 306 # reverse map of threecode 307 # upper() on all keys to enable caps-insensitive input seq handling 308 onecode = {k.upper(): v for k, v in 309 IUPACData.protein_letters_3to1_extended.items()} 310 # add the given termination codon code and custom maps 311 onecode.update((k.upper(), v) for k, v in custom_map.items()) 312 seqlist = [seq[3 * i:3 * (i + 1)] for i in range(len(seq) // 3)] 313 return "".join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
314 315 316 ###################################### 317 # Mixed ??? 318 ###################### 319 320
321 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False, 322 monoisotopic=False):
323 """Calculate the molecular mass of DNA, RNA or protein sequences as float. 324 325 Only unambiguous letters are allowed. Nucleotide sequences are assumed to 326 have a 5' phosphate. 327 328 Arguments: 329 - seq: String or Biopython sequence object. 330 - seq_type: The default (None) is to take the alphabet from the seq 331 argument, or assume DNA if the seq argument is a string. Override this 332 with a string 'DNA', 'RNA', or 'protein'. 333 - double_stranded: Calculate the mass for the double stranded molecule? 334 - circular: Is the molecule circular (has no ends)? 335 - monoisotopic: Use the monoisotopic mass tables? 336 337 Note that for backwards compatibility, if the seq argument is a string, 338 or Seq object with a generic alphabet, and no seq_type is specified 339 (i.e. left as None), then DNA is assumed. 340 341 >>> print("%0.2f" % molecular_weight("AGC")) 342 949.61 343 >>> print("%0.2f" % molecular_weight(Seq("AGC"))) 344 949.61 345 346 However, it is better to be explicit - for example with strings: 347 348 >>> print("%0.2f" % molecular_weight("AGC", "DNA")) 349 949.61 350 >>> print("%0.2f" % molecular_weight("AGC", "RNA")) 351 997.61 352 >>> print("%0.2f" % molecular_weight("AGC", "protein")) 353 249.29 354 355 Or, with the sequence alphabet: 356 357 >>> from Bio.Seq import Seq 358 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 359 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna))) 360 949.61 361 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna))) 362 997.61 363 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein))) 364 249.29 365 366 Also note that contradictory sequence alphabets and seq_type will also 367 give an exception: 368 369 >>> from Bio.Seq import Seq 370 >>> from Bio.Alphabet import generic_dna 371 >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA")) 372 Traceback (most recent call last): 373 ... 374 ValueError: seq_type='RNA' contradicts DNA from seq alphabet 375 376 """ 377 # Rewritten by Markus Piotrowski, 2014 378 379 # Find the alphabet type 380 tmp_type = "" 381 if isinstance(seq, (Seq, MutableSeq)): 382 base_alphabet = Alphabet._get_base_alphabet(seq.alphabet) 383 if isinstance(base_alphabet, Alphabet.DNAAlphabet): 384 tmp_type = "DNA" 385 elif isinstance(base_alphabet, Alphabet.RNAAlphabet): 386 tmp_type = "RNA" 387 elif isinstance(base_alphabet, Alphabet.ProteinAlphabet): 388 tmp_type = "protein" 389 elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein): 390 tmp_type = "protein" 391 # Convert to one-letter sequence. Have to use a string for seq1 392 seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet()) 393 elif not isinstance(base_alphabet, Alphabet.Alphabet): 394 raise TypeError("%s is not a valid alphabet for mass calculations" 395 % base_alphabet) 396 else: 397 tmp_type = "DNA" # backward compatibity 398 if seq_type and tmp_type and tmp_type != seq_type: 399 raise ValueError("seq_type=%r contradicts %s from seq alphabet" 400 % (seq_type, tmp_type)) 401 seq_type = tmp_type 402 elif isinstance(seq, str): 403 if seq_type is None: 404 seq_type = "DNA" # backward compatibity 405 else: 406 raise TypeError("Expected a string or Seq object, not seq=%r" % seq) 407 408 seq = "".join(str(seq).split()).upper() # Do the minimum formatting 409 410 if seq_type == "DNA": 411 if monoisotopic: 412 weight_table = IUPACData.monoisotopic_unambiguous_dna_weights 413 else: 414 weight_table = IUPACData.unambiguous_dna_weights 415 elif seq_type == "RNA": 416 if monoisotopic: 417 weight_table = IUPACData.monoisotopic_unambiguous_rna_weights 418 else: 419 weight_table = IUPACData.unambiguous_rna_weights 420 elif seq_type == "protein": 421 if monoisotopic: 422 weight_table = IUPACData.monoisotopic_protein_weights 423 else: 424 weight_table = IUPACData.protein_weights 425 else: 426 raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" 427 % seq_type) 428 429 if monoisotopic: 430 water = 18.010565 431 else: 432 water = 18.0153 433 434 try: 435 weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 436 if circular: 437 weight -= water 438 except KeyError as e: 439 raise ValueError("%s is not a valid unambiguous letter for %s" 440 % (e, seq_type)) 441 442 if seq_type in ("DNA", "RNA") and double_stranded: 443 seq = str(Seq(seq).complement()) 444 weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water 445 if circular: 446 weight -= water 447 elif seq_type == "protein" and double_stranded: 448 raise ValueError("double-stranded proteins await their discovery") 449 450 return weight
451 452
453 -def six_frame_translations(seq, genetic_code=1):
454 """Return pretty string showing the 6 frame translations and GC content. 455 456 Nice looking 6 frame translation with GC content - code from xbbtools 457 similar to DNA Striders six-frame translation 458 459 >>> from Bio.SeqUtils import six_frame_translations 460 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 461 GC_Frame: a:5 t:0 g:8 c:5 462 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 463 <BLANKLINE> 464 <BLANKLINE> 465 1/1 466 G H C N G P L 467 W P L * W A A 468 M A I V M G R * 469 auggccauuguaaugggccgcuga 54 % 470 uaccgguaacauuacccggcgacu 471 A M T I P R Q 472 H G N Y H A A S 473 P W Q L P G S 474 <BLANKLINE> 475 <BLANKLINE> 476 477 """ # noqa for pep8 W291 trailing whitespace 478 from Bio.Seq import reverse_complement, translate 479 anti = reverse_complement(seq) 480 comp = anti[::-1] 481 length = len(seq) 482 frames = {} 483 for i in range(0, 3): 484 fragment_length = 3 * ((length - i) // 3) 485 frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code) 486 frames[-(i + 1)] = translate(anti[i:i + fragment_length], 487 genetic_code)[::-1] 488 489 # create header 490 if length > 20: 491 short = "%s ... %s" % (seq[:10], seq[-10:]) 492 else: 493 short = seq 494 header = "GC_Frame: " 495 for nt in ["a", "t", "g", "c"]: 496 header += "%s:%d " % (nt, seq.count(nt.upper())) 497 498 header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (short.lower(), 499 length, GC(seq)) 500 res = header 501 502 for i in range(0, length, 60): 503 subseq = seq[i:i + 60] 504 csubseq = comp[i:i + 60] 505 p = i // 3 506 res += "%d/%d\n" % (i + 1, i / 3 + 1) 507 res += " " + " ".join(frames[3][p:p + 20]) + "\n" 508 res += " " + " ".join(frames[2][p:p + 20]) + "\n" 509 res += " ".join(frames[1][p:p + 20]) + "\n" 510 # seq 511 res += subseq.lower() + "%5d %%\n" % int(GC(subseq)) 512 res += csubseq.lower() + "\n" 513 # - frames 514 res += " ".join(frames[-2][p:p + 20]) + " \n" 515 res += " " + " ".join(frames[-1][p:p + 20]) + "\n" 516 res += " " + " ".join(frames[-3][p:p + 20]) + "\n\n" 517 return res
518 519 520 if __name__ == "__main__": 521 from Bio._utils import run_doctest 522 run_doctest() 523