Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  from __future__ import print_function 
 13   
 14  import re 
 15  from math import pi, sin, cos 
 16   
 17  from Bio.Seq import Seq 
 18  from Bio.Alphabet import IUPAC 
 19  from Bio.Data import IUPACData 
 20   
 21   
 22  ###################################### 
 23  # DNA 
 24  ###################### 
 25  # {{{ 
 26   
 27   
28 -def GC(seq):
29 """Calculates G+C content, returns the percentage (float between 0 and 100). 30 31 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 32 when counting the G and C content. The percentage is calculated against 33 the full length, e.g.: 34 35 >>> from Bio.SeqUtils import GC 36 >>> GC("ACTGN") 37 40.0 38 39 Note that this will return zero for an empty sequence. 40 """ 41 try: 42 gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's']) 43 return gc*100.0/len(seq) 44 except ZeroDivisionError: 45 return 0.0
46 47
48 -def GC123(seq):
49 """Calculates total G+C content plus first, second and third positions. 50 51 Returns a tuple of four floats (percentages between 0 and 100) for the 52 entire sequence, and the three codon positions. e.g. 53 54 >>> from Bio.SeqUtils import GC123 55 >>> GC123("ACTGTN") 56 (40.0, 50.0, 50.0, 0.0) 57 58 Copes with mixed case sequences, but does NOT deal with ambiguous 59 nucleotides. 60 """ 61 d= {} 62 for nt in ['A', 'T', 'G', 'C']: 63 d[nt] = [0, 0, 0] 64 65 for i in range(0, len(seq), 3): 66 codon = seq[i:i+3] 67 if len(codon) < 3: 68 codon += ' ' 69 for pos in range(0, 3): 70 for nt in ['A', 'T', 'G', 'C']: 71 if codon[pos] == nt or codon[pos] == nt.lower(): 72 d[nt][pos] += 1 73 gc = {} 74 gcall = 0 75 nall = 0 76 for i in range(0, 3): 77 try: 78 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 79 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 80 except: 81 gc[i] = 0 82 83 gcall = gcall + d['G'][i] + d['C'][i] 84 nall = nall + n 85 86 gcall = 100.0*gcall/nall 87 return gcall, gc[0], gc[1], gc[2]
88 89
90 -def GC_skew(seq, window=100):
91 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence. 92 93 Returns a list of ratios (floats), controlled by the length of the sequence 94 and the size of the window. 95 96 Does NOT look at any ambiguous nucleotides. 97 """ 98 # 8/19/03: Iddo: added lowercase 99 values = [] 100 for i in range(0, len(seq), window): 101 s = seq[i: i + window] 102 g = s.count('G') + s.count('g') 103 c = s.count('C') + s.count('c') 104 skew = (g-c)/float(g+c) 105 values.append(skew) 106 return values
107 108
109 -def xGC_skew(seq, window=1000, zoom=100, 110 r=300, px=100, py=100):
111 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 112 try: 113 import Tkinter as tkinter # Python 2 114 except ImportError: 115 import tkinter # Python 3 116 117 yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL) 118 xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL) 119 canvas = tkinter.Canvas(yscrollcommand=yscroll.set, 120 xscrollcommand=xscroll.set, background='white') 121 win = canvas.winfo_toplevel() 122 win.geometry('700x700') 123 124 yscroll.config(command=canvas.yview) 125 xscroll.config(command=canvas.xview) 126 yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y) 127 xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X) 128 canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1) 129 canvas.update() 130 131 X0, Y0 = r + px, r + py 132 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r 133 134 ty = Y0 135 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 136 ty += 20 137 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq))) 138 ty += 20 139 canvas.create_text(X0, ty, text='GC Skew', fill='blue') 140 ty += 20 141 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta') 142 ty += 20 143 canvas.create_oval(x1, y1, x2, y2) 144 145 acc = 0 146 start = 0 147 for gc in GC_skew(seq, window): 148 r1 = r 149 acc += gc 150 # GC skew 151 alpha = pi - (2*pi*start)/len(seq) 152 r2 = r1 - gc*zoom 153 x1 = X0 + r1 * sin(alpha) 154 y1 = Y0 + r1 * cos(alpha) 155 x2 = X0 + r2 * sin(alpha) 156 y2 = Y0 + r2 * cos(alpha) 157 canvas.create_line(x1, y1, x2, y2, fill='blue') 158 # accumulated GC skew 159 r1 = r - 50 160 r2 = r1 - acc 161 x1 = X0 + r1 * sin(alpha) 162 y1 = Y0 + r1 * cos(alpha) 163 x2 = X0 + r2 * sin(alpha) 164 y2 = Y0 + r2 * cos(alpha) 165 canvas.create_line(x1, y1, x2, y2, fill='magenta') 166 167 canvas.update() 168 start += window 169 170 canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
171 172
173 -def molecular_weight(seq):
174 """Calculate the molecular weight of a DNA sequence.""" 175 if isinstance(seq, str): 176 seq = Seq(seq, IUPAC.unambiguous_dna) 177 weight_table = IUPACData.unambiguous_dna_weights 178 return sum(weight_table[x] for x in seq)
179 180
181 -def nt_search(seq, subseq):
182 """Search for a DNA subseq in sequence. 183 184 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 185 searches only on forward strand 186 """ 187 pattern = '' 188 for nt in subseq: 189 value = IUPACData.ambiguous_dna_values[nt] 190 if len(value) == 1: 191 pattern += value 192 else: 193 pattern += '[%s]' % value 194 195 pos = -1 196 result = [pattern] 197 l = len(seq) 198 while True: 199 pos += 1 200 s = seq[pos:] 201 m = re.search(pattern, s) 202 if not m: 203 break 204 pos += int(m.start(0)) 205 result.append(pos) 206 return result
207 208 # }}} 209 210 ###################################### 211 # Protein 212 ###################### 213 # {{{ 214
215 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
216 """Turn a one letter code protein sequence into one with three letter codes. 217 218 The single input argument 'seq' should be a protein sequence using single 219 letter codes, either as a python string or as a Seq or MutableSeq object. 220 221 This function returns the amino acid sequence as a string using the three 222 letter amino acid codes. Output follows the IUPAC standard (including 223 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 224 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 225 Any unknown character (including possible gap characters), is changed into 226 'Xaa'. 227 228 e.g. 229 230 >>> from Bio.SeqUtils import seq3 231 >>> seq3("MAIVMGRWKGAR*") 232 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 233 234 You can set a custom translation of the codon termination code using the 235 "custom_map" argument, e.g. 236 237 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"}) 238 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***' 239 240 You can also set a custom translation for non-amino acid characters, such 241 as '-', using the "undef_code" argument, e.g. 242 243 >>> seq3("MAIVMGRWKGA--R*", undef_code='---') 244 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer' 245 246 If not given, "undef_code" defaults to "Xaa", e.g. 247 248 >>> seq3("MAIVMGRWKGA--R*") 249 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer' 250 251 This function was inspired by BioPerl's seq3. 252 """ 253 # not doing .update() on IUPACData dict with custom_map dict 254 # to preserve its initial state (may be imported in other modules) 255 threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) + 256 list(custom_map.items())) 257 #We use a default of 'Xaa' for undefined letters 258 #Note this will map '-' to 'Xaa' which may be undesirable! 259 return ''.join(threecode.get(aa, undef_code) for aa in seq)
260 261
262 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
263 """Turns a three-letter code protein sequence into one with single letter codes. 264 265 The single input argument 'seq' should be a protein sequence using three- 266 letter codes, either as a python string or as a Seq or MutableSeq object. 267 268 This function returns the amino acid sequence as a string using the one 269 letter amino acid codes. Output follows the IUPAC standard (including 270 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for 271 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code. 272 Any unknown character (including possible gap characters), is changed into 273 '-'. 274 275 e.g. 276 277 >>> from Bio.SeqUtils import seq3 278 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer") 279 'MAIVMGRWKGAR*' 280 281 The input is case insensitive, e.g. 282 283 >>> from Bio.SeqUtils import seq3 284 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer") 285 'MAIVMGRWKGAR*' 286 287 You can set a custom translation of the codon termination code using the 288 "custom_map" argument, e.g. 289 290 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"}) 291 'MAIVMGRWKGAR*' 292 293 You can also set a custom translation for non-amino acid characters, such 294 as '-', using the "undef_code" argument, e.g. 295 296 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?') 297 'MAIVMGRWKGA??R*' 298 299 If not given, "undef_code" defaults to "X", e.g. 300 301 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer") 302 'MAIVMGRWKGAXXR*' 303 304 """ 305 # reverse map of threecode 306 # upper() on all keys to enable caps-insensitive input seq handling 307 onecode = dict((k.upper(), v) for k, v in 308 IUPACData.protein_letters_3to1_extended.items()) 309 # add the given termination codon code and custom maps 310 onecode.update((k.upper(), v) for (k, v) in custom_map.items()) 311 seqlist = [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)] 312 return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
313 314 315 # }}} 316 317 ###################################### 318 # Mixed ??? 319 ###################### 320 # {{{ 321 322
323 -def six_frame_translations(seq, genetic_code=1):
324 """Formatted string showing the 6 frame translations and GC content. 325 326 nice looking 6 frame translation with GC content - code from xbbtools 327 similar to DNA Striders six-frame translation 328 329 >>> from Bio.SeqUtils import six_frame_translations 330 >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")) 331 GC_Frame: a:5 t:0 g:8 c:5 332 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC 333 <BLANKLINE> 334 <BLANKLINE> 335 1/1 336 G H C N G P L 337 W P L * W A A 338 M A I V M G R * 339 auggccauuguaaugggccgcuga 54 % 340 uaccgguaacauuacccggcgacu 341 A M T I P R Q 342 H G N Y H A A S 343 P W Q L P G S 344 <BLANKLINE> 345 <BLANKLINE> 346 347 """ 348 from Bio.Seq import reverse_complement, translate 349 anti = reverse_complement(seq) 350 comp = anti[::-1] 351 length = len(seq) 352 frames = {} 353 for i in range(0, 3): 354 fragment_length = 3 * ((length-i) // 3) 355 frames[i+1] = translate(seq[i:i+fragment_length], genetic_code) 356 frames[-(i+1)] = translate(anti[i:i+fragment_length], genetic_code)[::-1] 357 358 # create header 359 if length > 20: 360 short = '%s ... %s' % (seq[:10], seq[-10:]) 361 else: 362 short = seq 363 header = 'GC_Frame: ' 364 for nt in ['a', 't', 'g', 'c']: 365 header += '%s:%d ' % (nt, seq.count(nt.upper())) 366 367 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq)) 368 res = header 369 370 for i in range(0, length, 60): 371 subseq = seq[i:i+60] 372 csubseq = comp[i:i+60] 373 p = i//3 374 res += '%d/%d\n' % (i+1, i/3+1) 375 res += ' ' + ' '.join(frames[3][p:p+20]) + '\n' 376 res += ' ' + ' '.join(frames[2][p:p+20]) + '\n' 377 res += ' '.join(frames[1][p:p+20]) + '\n' 378 # seq 379 res += subseq.lower() + '%5d %%\n' % int(GC(subseq)) 380 res += csubseq.lower() + '\n' 381 # - frames 382 res += ' '.join(frames[-2][p:p+20]) +' \n' 383 res += ' ' + ' '.join(frames[-1][p:p+20]) + '\n' 384 res += ' ' + ' '.join(frames[-3][p:p+20]) + '\n\n' 385 return res
386 387 # }}} 388 389 ###################################### 390 # FASTA file utilities 391 ###################### 392 # {{{ 393 394
395 -def quick_FASTA_reader(file):
396 """Simple FASTA reader, returning a list of string tuples (OBSOLETE). 397 398 The single argument 'file' should be the filename of a FASTA format file. 399 This function will open and read in the entire file, constructing a list 400 of all the records, each held as a tuple of strings (the sequence name or 401 title, and its sequence). 402 403 >>> seqs = quick_FASTA_reader("Fasta/dups.fasta") 404 >>> for title, sequence in seqs: 405 ... print("%s %s" % (title, sequence)) 406 alpha ACGTA 407 beta CGTC 408 gamma CCGCC 409 alpha (again - this is a duplicate entry to test the indexing code) ACGTA 410 delta CGCGC 411 412 This function was is fast, but because it returns the data as a single in 413 memory list, is unsuitable for large files where an iterator approach is 414 preferable. 415 416 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 417 allows you to iterate over the records one by one (avoiding having all the 418 records in memory at once). Using Bio.SeqIO also makes it easy to switch 419 between different input file formats. However, please note that rather 420 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 421 422 If you want to use simple strings, use the function SimpleFastaParser 423 added to Bio.SeqIO.FastaIO in Biopython 1.61 instead. 424 """ 425 from Bio.SeqIO.FastaIO import SimpleFastaParser 426 with open(file) as handle: 427 entries = list(SimpleFastaParser(handle)) 428 return entries
429 430 431 # }}} 432 433
434 -def _test():
435 """Run the module's doctests (PRIVATE).""" 436 import os 437 import doctest 438 if os.path.isdir(os.path.join("..", "Tests")): 439 print("Running doctests...") 440 cur_dir = os.path.abspath(os.curdir) 441 os.chdir(os.path.join("..", "Tests")) 442 doctest.testmod() 443 os.chdir(cur_dir) 444 del cur_dir 445 print("Done") 446 elif os.path.isdir(os.path.join("Tests")): 447 print("Running doctests...") 448 cur_dir = os.path.abspath(os.curdir) 449 os.chdir(os.path.join("Tests")) 450 doctest.testmod() 451 os.chdir(cur_dir) 452 del cur_dir 453 print("Done")
454 455 456 if __name__ == "__main__": 457 _test() 458