1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re
13 from math import pi, sin, cos
14
15 from Bio.Seq import Seq
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData
18
19
20
21
22
23
24
25
27 """Calculates G+C content, returns the percentage (float between 0 and 100).
28
29 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
30 when counting the G and C content. The percentage is calculated against
31 the full length, e.g.:
32
33 >>> from Bio.SeqUtils import GC
34 >>> GC("ACTGN")
35 40.0
36
37 Note that this will return zero for an empty sequence.
38 """
39 try:
40 gc = sum(map(seq.count, ['G', 'C', 'g', 'c', 'S', 's']))
41 return gc*100.0/len(seq)
42 except ZeroDivisionError:
43 return 0.0
44
45
47 """Calculates total G+C content plus first, second and third positions.
48
49 Returns a tuple of four floats (percentages between 0 and 100) for the
50 entire sequence, and the three codon positions. e.g.
51
52 >>> from Bio.SeqUtils import GC123
53 >>> GC123("ACTGTN")
54 (40.0, 50.0, 50.0, 0.0)
55
56 Copes with mixed case sequences, but does NOT deal with ambiguous
57 nucleotides.
58 """
59 d= {}
60 for nt in ['A', 'T', 'G', 'C']:
61 d[nt] = [0, 0, 0]
62
63 for i in range(0, len(seq), 3):
64 codon = seq[i:i+3]
65 if len(codon) < 3:
66 codon += ' '
67 for pos in range(0, 3):
68 for nt in ['A', 'T', 'G', 'C']:
69 if codon[pos] == nt or codon[pos] == nt.lower():
70 d[nt][pos] += 1
71 gc = {}
72 gcall = 0
73 nall = 0
74 for i in range(0, 3):
75 try:
76 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
77 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
78 except:
79 gc[i] = 0
80
81 gcall = gcall + d['G'][i] + d['C'][i]
82 nall = nall + n
83
84 gcall = 100.0*gcall/nall
85 return gcall, gc[0], gc[1], gc[2]
86
87
89 """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence.
90
91 Returns a list of ratios (floats), controlled by the length of the sequence
92 and the size of the window.
93
94 Does NOT look at any ambiguous nucleotides.
95 """
96
97 values = []
98 for i in range(0, len(seq), window):
99 s = seq[i: i + window]
100 g = s.count('G') + s.count('g')
101 c = s.count('C') + s.count('c')
102 skew = (g-c)/float(g+c)
103 values.append(skew)
104 return values
105
106
107 -def xGC_skew(seq, window=1000, zoom=100,
108 r=300, px=100, py=100):
109 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
110 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
111 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
112 yscroll = Scrollbar(orient=VERTICAL)
113 xscroll = Scrollbar(orient=HORIZONTAL)
114 canvas = Canvas(yscrollcommand=yscroll.set,
115 xscrollcommand=xscroll.set, background='white')
116 win = canvas.winfo_toplevel()
117 win.geometry('700x700')
118
119 yscroll.config(command=canvas.yview)
120 xscroll.config(command=canvas.xview)
121 yscroll.pack(side=RIGHT, fill=Y)
122 xscroll.pack(side=BOTTOM, fill=X)
123 canvas.pack(fill=BOTH, side=LEFT, expand=1)
124 canvas.update()
125
126 X0, Y0 = r + px, r + py
127 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r
128
129 ty = Y0
130 canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
131 ty += 20
132 canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq)))
133 ty += 20
134 canvas.create_text(X0, ty, text='GC Skew', fill='blue')
135 ty += 20
136 canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta')
137 ty += 20
138 canvas.create_oval(x1, y1, x2, y2)
139
140 acc = 0
141 start = 0
142 for gc in GC_skew(seq, window):
143 r1 = r
144 acc += gc
145
146 alpha = pi - (2*pi*start)/len(seq)
147 r2 = r1 - gc*zoom
148 x1 = X0 + r1 * sin(alpha)
149 y1 = Y0 + r1 * cos(alpha)
150 x2 = X0 + r2 * sin(alpha)
151 y2 = Y0 + r2 * cos(alpha)
152 canvas.create_line(x1, y1, x2, y2, fill='blue')
153
154 r1 = r - 50
155 r2 = r1 - acc
156 x1 = X0 + r1 * sin(alpha)
157 y1 = Y0 + r1 * cos(alpha)
158 x2 = X0 + r2 * sin(alpha)
159 y2 = Y0 + r2 * cos(alpha)
160 canvas.create_line(x1, y1, x2, y2, fill='magenta')
161
162 canvas.update()
163 start += window
164
165 canvas.configure(scrollregion=canvas.bbox(ALL))
166
167
174
175
177 """Search for a DNA subseq in sequence.
178
179 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
180 searches only on forward strand
181 """
182 pattern = ''
183 for nt in subseq:
184 value = IUPACData.ambiguous_dna_values[nt]
185 if len(value) == 1:
186 pattern += value
187 else:
188 pattern += '[%s]' % value
189
190 pos = -1
191 result = [pattern]
192 l = len(seq)
193 while True:
194 pos += 1
195 s = seq[pos:]
196 m = re.search(pattern, s)
197 if not m:
198 break
199 pos += int(m.start(0))
200 result.append(pos)
201 return result
202
203
204
205
206
207
208
209
210 _THREECODE = {'A': 'Ala', 'B': 'Asx', 'C': 'Cys', 'D': 'Asp',
211 'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His',
212 'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met',
213 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg',
214 'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp',
215 'Y': 'Tyr', 'Z': 'Glx', 'X': 'Xaa',
216 'U': 'Sel', 'O': 'Pyl', 'J': 'Xle',
217 }
218
219
220 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
221 """Turn a one letter code protein sequence into one with three letter codes.
222
223 The single input argument 'seq' should be a protein sequence using single
224 letter codes, either as a python string or as a Seq or MutableSeq object.
225
226 This function returns the amino acid sequence as a string using the three
227 letter amino acid codes. Output follows the IUPAC standard (including
228 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
229 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
230 Any unknown character (including possible gap characters), is changed into
231 'Xaa'.
232
233 e.g.
234
235 >>> from Bio.SeqUtils import seq3
236 >>> seq3("MAIVMGRWKGAR*")
237 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
238
239 You can set a custom translation of the codon termination code using the
240 "custom_map" argument, e.g.
241
242 >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
243 'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'
244
245 You can also set a custom translation for non-amino acid characters, such
246 as '-', using the "undef_code" argument, e.g.
247
248 >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
249 'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'
250
251 If not given, "undef_code" defaults to "Xaa", e.g.
252
253 >>> seq3("MAIVMGRWKGA--R*")
254 'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'
255
256 This function was inspired by BioPerl's seq3.
257 """
258 threecode = _THREECODE
259
260 threecode.update(custom_map)
261
262
263 return ''.join([threecode.get(aa, undef_code) for aa in seq])
264
265
266 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
267 """Turns a three-letter code protein sequence into one with single letter codes.
268
269 The single input argument 'seq' should be a protein sequence using three-
270 letter codes, either as a python string or as a Seq or MutableSeq object.
271
272 This function returns the amino acid sequence as a string using the one
273 letter amino acid codes. Output follows the IUPAC standard (including
274 ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for
275 "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code.
276 Any unknown character (including possible gap characters), is changed into
277 '-'.
278
279 e.g.
280
281 >>> from Bio.SeqUtils import seq3
282 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer")
283 'MAIVMGRWKGAR*'
284
285 The input is case insensitive, e.g.
286
287 >>> from Bio.SeqUtils import seq3
288 >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer")
289 'MAIVMGRWKGAR*'
290
291 You can set a custom translation of the codon termination code using the
292 "custom_map" argument, e.g.
293
294 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"})
295 'MAIVMGRWKGAR*'
296
297 You can also set a custom translation for non-amino acid characters, such
298 as '-', using the "undef_code" argument, e.g.
299
300 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?')
301 'MAIVMGRWKGA??R*'
302
303 If not given, "undef_code" defaults to "X", e.g.
304
305 >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer")
306 'MAIVMGRWKGAXXR*'
307
308 """
309
310 onecode = dict([(x[1].upper(), x[0]) for x in _THREECODE.items()])
311
312 onecode.update((k.upper(), v) for (k, v) in custom_map.iteritems())
313 seqlist = [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)]
314 return ''.join([onecode.get(aa.upper(), undef_code) for aa in seqlist])
315
316
317
318
319
320
321
322
323
324
326 """Formatted string showing the 6 frame translations and GC content.
327
328 nice looking 6 frame translation with GC content - code from xbbtools
329 similar to DNA Striders six-frame translation
330
331 >>> from Bio.SeqUtils import six_frame_translations
332 >>> print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
333 GC_Frame: a:5 t:0 g:8 c:5
334 Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
335 <BLANKLINE>
336 <BLANKLINE>
337 1/1
338 G H C N G P L
339 W P L * W A A
340 M A I V M G R *
341 auggccauuguaaugggccgcuga 54 %
342 uaccgguaacauuacccggcgacu
343 A M T I P R Q
344 H G N Y H A A S
345 P W Q L P G S
346 <BLANKLINE>
347 <BLANKLINE>
348
349 """
350 from Bio.Seq import reverse_complement, translate
351 anti = reverse_complement(seq)
352 comp = anti[::-1]
353 length = len(seq)
354 frames = {}
355 for i in range(0, 3):
356 frames[i+1] = translate(seq[i:], genetic_code)
357 frames[-(i+1)] = translate(anti[i:], genetic_code)[::-1]
358
359
360 if length > 20:
361 short = '%s ... %s' % (seq[:10], seq[-10:])
362 else:
363 short = seq
364 header = 'GC_Frame: '
365 for nt in ['a', 't', 'g', 'c']:
366 header += '%s:%d ' % (nt, seq.count(nt.upper()))
367
368 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq))
369 res = header
370
371 for i in range(0, length, 60):
372 subseq = seq[i:i+60]
373 csubseq = comp[i:i+60]
374 p = i//3
375 res += '%d/%d\n' % (i+1, i/3+1)
376 res += ' ' + ' '.join(map(None, frames[3][p:p+20])) + '\n'
377 res += ' ' + ' '.join(map(None, frames[2][p:p+20])) + '\n'
378 res += ' '.join(map(None, frames[1][p:p+20])) + '\n'
379
380 res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
381 res += csubseq.lower() + '\n'
382
383 res += ' '.join(map(None, frames[-2][p:p+20])) +' \n'
384 res += ' ' + ' '.join(map(None, frames[-1][p:p+20])) + '\n'
385 res += ' ' + ' '.join(map(None, frames[-3][p:p+20])) + '\n\n'
386 return res
387
388
389
390
391
392
393
394
395
397 """Simple FASTA reader, returning a list of string tuples (OBSOLETE).
398
399 The single argument 'file' should be the filename of a FASTA format file.
400 This function will open and read in the entire file, constructing a list
401 of all the records, each held as a tuple of strings (the sequence name or
402 title, and its sequence).
403
404 >>> seqs = quick_FASTA_reader("Fasta/dups.fasta")
405 >>> for title, sequence in seqs:
406 ... print title, sequence
407 alpha ACGTA
408 beta CGTC
409 gamma CCGCC
410 alpha (again - this is a duplicate entry to test the indexing code) ACGTA
411 delta CGCGC
412
413 This function was is fast, but because it returns the data as a single in
414 memory list, is unsuitable for large files where an iterator approach is
415 preferable.
416
417 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
418 allows you to iterate over the records one by one (avoiding having all the
419 records in memory at once). Using Bio.SeqIO also makes it easy to switch
420 between different input file formats. However, please note that rather
421 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
422
423 If you want to use simple strings, use the function SimpleFastaParser
424 added to Bio.SeqIO.FastaIO in Biopython 1.61 instead.
425 """
426 handle = open(file)
427 entries = []
428 from Bio.SeqIO.FastaIO import SimpleFastaParser
429 for title, sequence in SimpleFastaParser(handle):
430 entries.append((title, sequence))
431 handle.close()
432 return entries
433
434
435
436
437
439 """Run the module's doctests (PRIVATE)."""
440 import os
441 import doctest
442 if os.path.isdir(os.path.join("..", "Tests")):
443 print "Running doctests..."
444 cur_dir = os.path.abspath(os.curdir)
445 os.chdir(os.path.join("..", "Tests"))
446 doctest.testmod()
447 os.chdir(cur_dir)
448 del cur_dir
449 print "Done"
450 elif os.path.isdir(os.path.join("Tests")):
451 print "Running doctests..."
452 cur_dir = os.path.abspath(os.curdir)
453 os.chdir(os.path.join("Tests"))
454 doctest.testmod()
455 os.chdir(cur_dir)
456 del cur_dir
457 print "Done"
458
459
460 if __name__ == "__main__":
461 _test()
462