[frames] | no frames]

# Source Code for Package Bio.SeqUtils

```  1  #!/usr/bin/env python
2  # Created: Wed May 29 08:07:18 2002
3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
5  # Revisions copyright 2014 by Markus Piotrowski.
6  # Revisions copyright 2014 by Peter Cock.
8  # This code is part of the Biopython distribution and governed by its
10  # as part of this package.
11
12  """Miscellaneous functions for dealing with sequences."""
13
14  from __future__ import print_function
15
16  import re
17  from math import pi, sin, cos
18
19  from Bio.Seq import Seq, MutableSeq
20  from Bio import Alphabet
21  from Bio.Alphabet import IUPAC
22  from Bio.Data import IUPACData
23
24  __docformat__ = "restructuredtext en"
25
26  ######################################
27  # DNA
28  ######################
29  # {{{
30
31
32 -def GC(seq):
33      """Calculates G+C content, returns the percentage (float between 0 and 100).
34
35      Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
36      when counting the G and C content.  The percentage is calculated against
37      the full length, e.g.:
38
39      >>> from Bio.SeqUtils import GC
40      >>> GC("ACTGN")
41      40.0
42
43      Note that this will return zero for an empty sequence.
44      """
45      gc = sum(seq.count(x) for x in ['G', 'C', 'g', 'c', 'S', 's'])
46      try:
47          return gc * 100.0 / len(seq)
48      except ZeroDivisionError:
49          return 0.0
50
51
52 -def GC123(seq):
53      """Calculates total G+C content plus first, second and third positions.
54
55      Returns a tuple of four floats (percentages between 0 and 100) for the
56      entire sequence, and the three codon positions.  e.g.
57
58      >>> from Bio.SeqUtils import GC123
59      >>> GC123("ACTGTN")
60      (40.0, 50.0, 50.0, 0.0)
61
62      Copes with mixed case sequences, but does NOT deal with ambiguous
63      nucleotides.
64      """
65      d = {}
66      for nt in ['A', 'T', 'G', 'C']:
67          d[nt] = [0, 0, 0]
68
69      for i in range(0, len(seq), 3):
70          codon = seq[i:i + 3]
71          if len(codon) < 3:
72              codon += '  '
73          for pos in range(0, 3):
74              for nt in ['A', 'T', 'G', 'C']:
75                  if codon[pos] == nt or codon[pos] == nt.lower():
76                      d[nt][pos] += 1
77      gc = {}
78      gcall = 0
79      nall = 0
80      for i in range(0, 3):
81          try:
82              n = d['G'][i] + d['C'][i] + d['T'][i] + d['A'][i]
83              gc[i] = (d['G'][i] + d['C'][i]) * 100.0 / n
84          except:
85              gc[i] = 0
86
87          gcall = gcall + d['G'][i] + d['C'][i]
88          nall = nall + n
89
90      gcall = 100.0 * gcall / nall
91      return gcall, gc[0], gc[1], gc[2]
92
93
94 -def GC_skew(seq, window=100):
95      """Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence.
96
97      Returns a list of ratios (floats), controlled by the length of the sequence
98      and the size of the window.
99
100      Does NOT look at any ambiguous nucleotides.
101      """
102      # 8/19/03: Iddo: added lowercase
103      values = []
104      for i in range(0, len(seq), window):
105          s = seq[i: i + window]
106          g = s.count('G') + s.count('g')
107          c = s.count('C') + s.count('c')
108          skew = (g - c) / float(g + c)
109          values.append(skew)
110      return values
111
112
113 -def xGC_skew(seq, window=1000, zoom=100,
114                           r=300, px=100, py=100):
115      """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
116      try:
117          import Tkinter as tkinter  # Python 2
118      except ImportError:
119          import tkinter  # Python 3
120
121      yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL)
122      xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL)
123      canvas = tkinter.Canvas(yscrollcommand=yscroll.set,
124                              xscrollcommand=xscroll.set, background='white')
125      win = canvas.winfo_toplevel()
126      win.geometry('700x700')
127
128      yscroll.config(command=canvas.yview)
129      xscroll.config(command=canvas.xview)
130      yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y)
131      xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X)
132      canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1)
133      canvas.update()
134
135      X0, Y0 = r + px, r + py
136      x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r
137
138      ty = Y0
139      canvas.create_text(X0, ty, text='%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
140      ty += 20
141      canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq)))
142      ty += 20
143      canvas.create_text(X0, ty, text='GC Skew', fill='blue')
144      ty += 20
145      canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta')
146      ty += 20
147      canvas.create_oval(x1, y1, x2, y2)
148
149      acc = 0
150      start = 0
151      for gc in GC_skew(seq, window):
152          r1 = r
153          acc += gc
154          # GC skew
155          alpha = pi - (2 * pi * start) / len(seq)
156          r2 = r1 - gc * zoom
157          x1 = X0 + r1 * sin(alpha)
158          y1 = Y0 + r1 * cos(alpha)
159          x2 = X0 + r2 * sin(alpha)
160          y2 = Y0 + r2 * cos(alpha)
161          canvas.create_line(x1, y1, x2, y2, fill='blue')
162          # accumulated GC skew
163          r1 = r - 50
164          r2 = r1 - acc
165          x1 = X0 + r1 * sin(alpha)
166          y1 = Y0 + r1 * cos(alpha)
167          x2 = X0 + r2 * sin(alpha)
168          y2 = Y0 + r2 * cos(alpha)
169          canvas.create_line(x1, y1, x2, y2, fill='magenta')
170
171          canvas.update()
172          start += window
173
174      canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
175
176
177 -def nt_search(seq, subseq):
178      """Search for a DNA subseq in sequence.
179
180      use ambiguous values (like N = A or T or C or G, R = A or G etc.)
181      searches only on forward strand
182      """
183      pattern = ''
184      for nt in subseq:
185          value = IUPACData.ambiguous_dna_values[nt]
186          if len(value) == 1:
187              pattern += value
188          else:
189              pattern += '[%s]' % value
190
191      pos = -1
192      result = [pattern]
193      l = len(seq)
194      while True:
195          pos += 1
196          s = seq[pos:]
197          m = re.search(pattern, s)
198          if not m:
199              break
200          pos += int(m.start(0))
201          result.append(pos)
202      return result
203
204  # }}}
205
206  ######################################
207  # Protein
208  ######################
209  # {{{
210
211
212 -def seq3(seq, custom_map={'*': 'Ter'}, undef_code='Xaa'):
213      """Turn a one letter code protein sequence into one with three letter codes.
214
215      The single input argument 'seq' should be a protein sequence using single
216      letter codes, either as a python string or as a Seq or MutableSeq object.
217
218      This function returns the amino acid sequence as a string using the three
219      letter amino acid codes. Output follows the IUPAC standard (including
220      ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
221      for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
222      Any unknown character (including possible gap characters), is changed into
223      'Xaa'.
224
225      e.g.
226
227      >>> from Bio.SeqUtils import seq3
228      >>> seq3("MAIVMGRWKGAR*")
229      'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
230
231      You can set a custom translation of the codon termination code using the
232      "custom_map" argument, e.g.
233
234      >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
235      'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'
236
237      You can also set a custom translation for non-amino acid characters, such
238      as '-', using the "undef_code" argument, e.g.
239
240      >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
241      'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'
242
243      If not given, "undef_code" defaults to "Xaa", e.g.
244
245      >>> seq3("MAIVMGRWKGA--R*")
246      'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'
247
248      This function was inspired by BioPerl's seq3.
249      """
250      # not doing .update() on IUPACData dict with custom_map dict
251      # to preserve its initial state (may be imported in other modules)
252      threecode = dict(list(IUPACData.protein_letters_1to3_extended.items()) +
253                       list(custom_map.items()))
254      # We use a default of 'Xaa' for undefined letters
255      # Note this will map '-' to 'Xaa' which may be undesirable!
256      return ''.join(threecode.get(aa, undef_code) for aa in seq)
257
258
259 -def seq1(seq, custom_map={'Ter': '*'}, undef_code='X'):
260      """Turns a three-letter code protein sequence into one with single letter codes.
261
262      The single input argument 'seq' should be a protein sequence using three-
263      letter codes, either as a python string or as a Seq or MutableSeq object.
264
265      This function returns the amino acid sequence as a string using the one
266      letter amino acid codes. Output follows the IUPAC standard (including
267      ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for
268      "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code.
269      Any unknown character (including possible gap characters), is changed into
270      '-'.
271
272      e.g.
273
274      >>> from Bio.SeqUtils import seq3
275      >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer")
276      'MAIVMGRWKGAR*'
277
278      The input is case insensitive, e.g.
279
280      >>> from Bio.SeqUtils import seq3
281      >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer")
282      'MAIVMGRWKGAR*'
283
284      You can set a custom translation of the codon termination code using the
285      "custom_map" argument, e.g.
286
287      >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArg***", custom_map={"***": "*"})
288      'MAIVMGRWKGAR*'
289
290      You can also set a custom translation for non-amino acid characters, such
291      as '-', using the "undef_code" argument, e.g.
292
293      >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?')
294      'MAIVMGRWKGA??R*'
295
296      If not given, "undef_code" defaults to "X", e.g.
297
298      >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer")
299      'MAIVMGRWKGAXXR*'
300
301      """
302      # reverse map of threecode
303      # upper() on all keys to enable caps-insensitive input seq handling
304      onecode = dict((k.upper(), v) for k, v in
305                     IUPACData.protein_letters_3to1_extended.items())
306      # add the given termination codon code and custom maps
307      onecode.update((k.upper(), v) for (k, v) in custom_map.items())
308      seqlist = [seq[3 * i:3 * (i + 1)] for i in range(len(seq) // 3)]
309      return ''.join(onecode.get(aa.upper(), undef_code) for aa in seqlist)
310
311
312  # }}}
313
314  ######################################
315  # Mixed ???
316  ######################
317  # {{{
318
319
320 -def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
321                       monoisotopic=False):
322      """Calculates the molecular weight of a DNA, RNA or protein sequence.
323
324      Only unambiguous letters are allowed. Nucleotide sequences are assumed to
325      have a 5' phosphate.
326
327          - seq: String or Biopython sequence object.
328          - seq_type: The default (None) is to take the alphabet from the seq argument,
329            or assume DNA if the seq argument is a string. Override this with
330            a string 'DNA', 'RNA', or 'protein'.
331          - double_stranded: Calculate the mass for the double stranded molecule?
332          - circular: Is the molecule circular (has no ends)?
333          - monoisotopic: Use the monoisotopic mass tables?
334
335      Note that for backwards compatibility, if the seq argument is a string,
336      or Seq object with a generic alphabet, and no seq_type is specified
337      (i.e. left as None), then DNA is assumed.
338
339      >>> print("%0.2f" % molecular_weight("AGC"))
340      949.61
341      >>> print("%0.2f" % molecular_weight(Seq("AGC")))
342      949.61
343
344      However, it is better to be explicit - for example with strings:
345
346      >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
347      949.61
348      >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
349      997.61
350      >>> print("%0.2f" % molecular_weight("AGC", "protein"))
351      249.29
352
353      Or, with the sequence alphabet:
354
355      >>> from Bio.Seq import Seq
356      >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
357      >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
358      949.61
359      >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
360      997.61
361      >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
362      249.29
363
364      Also note that contradictory sequence alphabets and seq_type will also
365      give an exception:
366
367      >>> from Bio.Seq import Seq
368      >>> from Bio.Alphabet import generic_dna
369      >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
370      Traceback (most recent call last):
371        ...
372      ValueError: seq_type='RNA' contradicts DNA from seq alphabet
373
374      """
375      # Rewritten by Markus Piotrowski, 2014
376
377      # Find the alphabet type
378      tmp_type = ''
379      if isinstance(seq, Seq) or isinstance(seq, MutableSeq):
380          base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
381          if isinstance(base_alphabet, Alphabet.DNAAlphabet):
382              tmp_type = 'DNA'
383          elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
384              tmp_type = 'RNA'
385          elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
386              tmp_type = 'protein'
387          elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
388              tmp_type = 'protein'
389              # Convert to one-letter sequence. Have to use a string for seq1
390              seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
391          elif not isinstance(base_alphabet, Alphabet.Alphabet):
392              raise TypeError("%s is not a valid alphabet for mass calculations"
393                               % base_alphabet)
394          else:
395              tmp_type = "DNA"  # backward compatibity
396          if seq_type and tmp_type and tmp_type != seq_type:
397              raise ValueError("seq_type=%r contradicts %s from seq alphabet"
398                               % (seq_type, tmp_type))
399          seq_type = tmp_type
400      elif isinstance(seq, str):
401          if seq_type is None:
402              seq_type = "DNA"  # backward compatibity
403      else:
404          raise TypeError("Expected a string or Seq object, not seq=%r" % seq)
405
406      seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting
407
408      if seq_type == 'DNA':
409          if monoisotopic:
410              weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
411          else:
412              weight_table = IUPACData.unambiguous_dna_weights
413      elif seq_type == 'RNA':
414          if monoisotopic:
415              weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
416          else:
417              weight_table = IUPACData.unambiguous_rna_weights
418      elif seq_type == 'protein':
419          if monoisotopic:
420              weight_table = IUPACData.monoisotopic_protein_weights
421          else:
422              weight_table = IUPACData.protein_weights
423      else:
424          raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
425                           % seq_type)
426
427      if monoisotopic:
428          water = 18.010565
429      else:
430          water = 18.0153
431
432      try:
433          weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
434          if circular:
435              weight -= water
436      except KeyError as e:
437          raise ValueError('%s is not a valid unambiguous letter for %s'
438                           % (e, seq_type))
439      except:
440          raise
441
442      if seq_type in ('DNA', 'RNA') and double_stranded:
443          seq = str(Seq(seq).complement())
444          weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
445          if circular:
446              weight -= water
447      elif seq_type == 'protein' and double_stranded:
448          raise ValueError('double-stranded proteins await their discovery')
449
450      return weight
451
452
453 -def six_frame_translations(seq, genetic_code=1):
454      """Formatted string showing the 6 frame translations and GC content.
455
456      nice looking 6 frame translation with GC content - code from xbbtools
457      similar to DNA Striders six-frame translation
458
459      >>> from Bio.SeqUtils import six_frame_translations
460      >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
461      GC_Frame: a:5 t:0 g:8 c:5
462      Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
463      <BLANKLINE>
464      <BLANKLINE>
465      1/1
466        G  H  C  N  G  P  L
467       W  P  L  *  W  A  A
468      M  A  I  V  M  G  R  *
469      auggccauuguaaugggccgcuga   54 %
470      uaccgguaacauuacccggcgacu
471      A  M  T  I  P  R  Q
472       H  G  N  Y  H  A  A  S
473        P  W  Q  L  P  G  S
474      <BLANKLINE>
475      <BLANKLINE>
476
477      """
478      from Bio.Seq import reverse_complement, translate
479      anti = reverse_complement(seq)
480      comp = anti[::-1]
481      length = len(seq)
482      frames = {}
483      for i in range(0, 3):
484          fragment_length = 3 * ((length - i) // 3)
485          frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code)
486          frames[-(i + 1)] = translate(anti[i:i + fragment_length], genetic_code)[::-1]
487
489      if length > 20:
490          short = '%s ... %s' % (seq[:10], seq[-10:])
491      else:
492          short = seq
494      for nt in ['a', 't', 'g', 'c']:
495          header += '%s:%d ' % (nt, seq.count(nt.upper()))
496
497      header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(), length, GC(seq))
499
500      for i in range(0, length, 60):
501          subseq = seq[i:i + 60]
502          csubseq = comp[i:i + 60]
503          p = i // 3
504          res += '%d/%d\n' % (i + 1, i / 3 + 1)
505          res += '  ' + '  '.join(frames[3][p:p + 20]) + '\n'
506          res += ' ' + '  '.join(frames[2][p:p + 20]) + '\n'
507          res += '  '.join(frames[1][p:p + 20]) + '\n'
508          # seq
509          res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
510          res += csubseq.lower() + '\n'
511          # - frames
512          res += '  '.join(frames[-2][p:p + 20]) + ' \n'
513          res += ' ' + '  '.join(frames[-1][p:p + 20]) + '\n'
514          res += '  ' + '  '.join(frames[-3][p:p + 20]) + '\n\n'
515      return res
516
517  # }}}
518
519  ######################################
520  # FASTA file utilities
521  ######################
522  # {{{
523
524
526      """Simple FASTA reader, returning a list of string tuples (DEPRECATED).
527
528      The single argument 'file' should be the filename of a FASTA format file.
529      This function will open and read in the entire file, constructing a list
530      of all the records, each held as a tuple of strings (the sequence name or
531      title, and its sequence).
532
534      >>> for title, sequence in seqs:
535      ...     print("%s %s" % (title, sequence))
536      alpha ACGTA
537      beta CGTC
538      gamma CCGCC
539      alpha (again - this is a duplicate entry to test the indexing code) ACGTA
540      delta CGCGC
541
542      This function was is fast, but because it returns the data as a single in
543      memory list, is unsuitable for large files where an iterator approach is
544      preferable.
545
546      You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
547      allows you to iterate over the records one by one (avoiding having all the
548      records in memory at once).  Using Bio.SeqIO also makes it easy to switch
549      between different input file formats.  However, please note that rather
550      than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
551
552      If you want to use simple strings, use the function SimpleFastaParser
554      """
555      import warnings
556      from Bio import BiopythonDeprecationWarning
557      warnings.warn("The quick_FASTA_reader has been deprecated and will be "
558                    "removed in a future release of Biopython. Please try "
559                    "function SimpleFastaParser from Bio.SeqIO.FastaIO "
561      from Bio.SeqIO.FastaIO import SimpleFastaParser
562      with open(file) as handle:
563          entries = list(SimpleFastaParser(handle))
564      return entries
565
566
567  # }}}
568
569
570 -def _test():
571      """Run the module's doctests (PRIVATE)."""
572      import os
573      import doctest
574      if os.path.isdir(os.path.join("..", "Tests")):
575          print("Running doctests...")
576          cur_dir = os.path.abspath(os.curdir)
577          os.chdir(os.path.join("..", "Tests"))
578          doctest.testmod()
579          os.chdir(cur_dir)
580          del cur_dir
581          print("Done")
582      elif os.path.isdir(os.path.join("Tests")):
583          print("Running doctests...")
584          cur_dir = os.path.abspath(os.curdir)
585          os.chdir(os.path.join("Tests"))
586          doctest.testmod()
587          os.chdir(cur_dir)
588          del cur_dir
589          print("Done")
590
591
592  if __name__ == "__main__":
593      _test()
594
<!--
expandto(location.href);
// -->

```

 Generated by Epydoc 3.0.1 on Wed Oct 21 18:04:30 2015 http://epydoc.sourceforge.net