1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20 from Bio import Alphabet
21 from Bio.Alphabet import IUPAC
22 from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
23 from Bio.Data import CodonTable
24
25
27 """Makes a python string translation table (PRIVATE).
28
29 Arguments:
30 - complement_mapping - a dictionary such as ambiguous_dna_complement
31 and ambiguous_rna_complement from Data.IUPACData.
32
33 Returns a translation table (a string of length 256) for use with the
34 python string's translate method to use in a (reverse) complement.
35
36 Compatible with lower case and upper case sequences.
37
38 For internal use only.
39 """
40 before = ''.join(complement_mapping.keys())
41 after = ''.join(complement_mapping.values())
42 before = before + before.lower()
43 after = after + after.lower()
44 if sys.version_info[0] == 3:
45 return str.maketrans(before, after)
46 else:
47 return string.maketrans(before, after)
48
49 _dna_complement_table = _maketrans(ambiguous_dna_complement)
50 _rna_complement_table = _maketrans(ambiguous_rna_complement)
51
52
54 """A read-only sequence object (essentially a string with an alphabet).
55
56 Like normal python strings, our basic sequence object is immutable.
57 This prevents you from doing my_seq[5] = "A" for example, but does allow
58 Seq objects to be used as dictionary keys.
59
60 The Seq object provides a number of string like methods (such as count,
61 find, split and strip), which are alphabet aware where appropriate.
62
63 In addition to the string like sequence, the Seq object has an alphabet
64 property. This is an instance of an Alphabet class from Bio.Alphabet,
65 for example generic DNA, or IUPAC DNA. This describes the type of molecule
66 (e.g. RNA, DNA, protein) and may also indicate the expected symbols
67 (letters).
68
69 The Seq object also provides some biological methods, such as complement,
70 reverse_complement, transcribe, back_transcribe and translate (which are
71 not applicable to sequences with a protein alphabet).
72 """
74 """Create a Seq object.
75
76 Arguments:
77 - seq - Sequence, required (string)
78 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
79
80 You will typically use Bio.SeqIO to read in sequences from files as
81 SeqRecord objects, whose sequence will be exposed as a Seq object via
82 the seq property.
83
84 However, will often want to create your own Seq objects directly:
85
86 >>> from Bio.Seq import Seq
87 >>> from Bio.Alphabet import IUPAC
88 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
89 ... IUPAC.protein)
90 >>> my_seq
91 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
92 >>> print my_seq
93 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
94 >>> my_seq.alphabet
95 IUPACProtein()
96
97 """
98
99 if not isinstance(data, basestring):
100 raise TypeError("The sequence data given to a Seq object should "
101 "be a string (not another Seq object etc)")
102 self._data = data
103 self.alphabet = alphabet
104
106 """Returns a (truncated) representation of the sequence for debugging."""
107 if len(self) > 60:
108
109
110
111 return "%s('%s...%s', %s)" % (self.__class__.__name__,
112 str(self)[:54], str(self)[-3:],
113 repr(self.alphabet))
114 else:
115 return "%s(%s, %s)" % (self.__class__.__name__,
116 repr(self._data),
117 repr(self.alphabet))
118
120 """Returns the full sequence as a python string, use str(my_seq).
121
122 Note that Biopython 1.44 and earlier would give a truncated
123 version of repr(my_seq) for str(my_seq). If you are writing code
124 which need to be backwards compatible with old Biopython, you
125 should continue to use my_seq.tostring() rather than str(my_seq).
126 """
127 return self._data
128
130 """Hash for comparison.
131
132 See the __cmp__ documentation - we plan to change this!
133 """
134 return id(self)
135
137 """Compare the sequence to another sequence or a string (README).
138
139 Historically comparing Seq objects has done Python object comparison.
140 After considerable discussion (keeping in mind constraints of the
141 Python language, hashes and dictionary support) a future release of
142 Biopython will change this to use simple string comparison. The plan is
143 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a
144 warning.
145
146 This version of Biopython still does Python object comparison, but with
147 a warning about this future change. During this transition period,
148 please just do explicit comparisons:
149
150 >>> seq1 = Seq("ACGT")
151 >>> seq2 = Seq("ACGT")
152 >>> id(seq1) == id(seq2)
153 False
154 >>> str(seq1) == str(seq2)
155 True
156
157 Note - This method indirectly supports ==, < , etc.
158 """
159 if hasattr(other, "alphabet"):
160
161 import warnings
162 warnings.warn("In future comparing Seq objects will use string "
163 "comparison (not object comparison). Incompatible "
164 "alphabets will trigger a warning (not an exception). "
165 "In the interim please use id(seq1)==id(seq2) or "
166 "str(seq1)==str(seq2) to make your code explicit "
167 "and to avoid this warning.", FutureWarning)
168 return cmp(id(self), id(other))
169
171 """Returns the length of the sequence, use len(my_seq)."""
172 return len(self._data)
173
175 """Returns a subsequence of single letter, use my_seq[index]."""
176
177
178
179 if isinstance(index, int):
180
181 return self._data[index]
182 else:
183
184 return Seq(self._data[index], self.alphabet)
185
187 """Add another sequence or string to this sequence.
188
189 If adding a string to a Seq, the alphabet is preserved:
190
191 >>> from Bio.Seq import Seq
192 >>> from Bio.Alphabet import generic_protein
193 >>> Seq("MELKI", generic_protein) + "LV"
194 Seq('MELKILV', ProteinAlphabet())
195
196 When adding two Seq (like) objects, the alphabets are important.
197 Consider this example:
198
199 >>> from Bio.Seq import Seq
200 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
201 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna)
202 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna)
203 >>> unamb_dna_seq
204 Seq('ACGT', IUPACUnambiguousDNA())
205 >>> ambig_dna_seq
206 Seq('ACRGT', IUPACAmbiguousDNA())
207
208 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get
209 the more general ambiguous IUPAC DNA alphabet:
210
211 >>> unamb_dna_seq + ambig_dna_seq
212 Seq('ACGTACRGT', IUPACAmbiguousDNA())
213
214 However, if the default generic alphabet is included, the result is
215 a generic alphabet:
216
217 >>> Seq("") + ambig_dna_seq
218 Seq('ACRGT', Alphabet())
219
220 You can't add RNA and DNA sequences:
221
222 >>> from Bio.Alphabet import generic_dna, generic_rna
223 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna)
224 Traceback (most recent call last):
225 ...
226 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet()
227
228 You can't add nucleotide and protein sequences:
229
230 >>> from Bio.Alphabet import generic_dna, generic_protein
231 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein)
232 Traceback (most recent call last):
233 ...
234 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet()
235 """
236 if hasattr(other, "alphabet"):
237
238 if not Alphabet._check_type_compatible([self.alphabet,
239 other.alphabet]):
240 raise TypeError("Incompatible alphabets %s and %s"
241 % (repr(self.alphabet), repr(other.alphabet)))
242
243 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
244 return self.__class__(str(self) + str(other), a)
245 elif isinstance(other, basestring):
246
247 return self.__class__(str(self) + other, self.alphabet)
248 from Bio.SeqRecord import SeqRecord
249 if isinstance(other, SeqRecord):
250
251 return NotImplemented
252 else:
253 raise TypeError
254
256 """Adding a sequence on the left.
257
258 If adding a string to a Seq, the alphabet is preserved:
259
260 >>> from Bio.Seq import Seq
261 >>> from Bio.Alphabet import generic_protein
262 >>> "LV" + Seq("MELKI", generic_protein)
263 Seq('LVMELKI', ProteinAlphabet())
264
265 Adding two Seq (like) objects is handled via the __add__ method.
266 """
267 if hasattr(other, "alphabet"):
268
269 if not Alphabet._check_type_compatible([self.alphabet,
270 other.alphabet]):
271 raise TypeError("Incompatable alphabets %s and %s"
272 % (repr(self.alphabet), repr(other.alphabet)))
273
274 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
275 return self.__class__(str(other) + str(self), a)
276 elif isinstance(other, basestring):
277
278 return self.__class__(other + str(self), self.alphabet)
279 else:
280 raise TypeError
281
283 """Returns the full sequence as a python string (semi-obsolete).
284
285 Although not formally deprecated, you are now encouraged to use
286 str(my_seq) instead of my_seq.tostring()."""
287
288
289
290
291
292
293 return str(self)
294
296 """Returns the full sequence as a MutableSeq object.
297
298 >>> from Bio.Seq import Seq
299 >>> from Bio.Alphabet import IUPAC
300 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
301 ... IUPAC.protein)
302 >>> my_seq
303 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
304 >>> my_seq.tomutable()
305 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
306
307 Note that the alphabet is preserved.
308 """
309 return MutableSeq(str(self), self.alphabet)
310
312 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
313
314 For a string argument, returns the string.
315
316 For a Seq or MutableSeq, it checks the alphabet is compatible
317 (raising an exception if it isn't), and then returns a string.
318 """
319 try:
320 other_alpha = other_sequence.alphabet
321 except AttributeError:
322
323 return other_sequence
324
325
326 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]):
327 raise TypeError("Incompatable alphabets %s and %s"
328 % (repr(self.alphabet), repr(other_alpha)))
329
330 return str(other_sequence)
331
333 """Non-overlapping count method, like that of a python string.
334
335 This behaves like the python string method of the same name,
336 which does a non-overlapping count!
337
338 Returns an integer, the number of occurrences of substring
339 argument sub in the (sub)sequence given by [start:end].
340 Optional arguments start and end are interpreted as in slice
341 notation.
342
343 Arguments:
344 - sub - a string or another Seq object to look for
345 - start - optional integer, slice start
346 - end - optional integer, slice end
347
348 e.g.
349
350 >>> from Bio.Seq import Seq
351 >>> my_seq = Seq("AAAATGA")
352 >>> print my_seq.count("A")
353 5
354 >>> print my_seq.count("ATG")
355 1
356 >>> print my_seq.count(Seq("AT"))
357 1
358 >>> print my_seq.count("AT", 2, -1)
359 1
360
361 HOWEVER, please note because python strings and Seq objects (and
362 MutableSeq objects) do a non-overlapping search, this may not give
363 the answer you expect:
364
365 >>> "AAAA".count("AA")
366 2
367 >>> print Seq("AAAA").count("AA")
368 2
369
370 A non-overlapping search would give the answer as three!
371 """
372
373 sub_str = self._get_seq_str_and_check_alphabet(sub)
374 return str(self).count(sub_str, start, end)
375
377 """Implements the 'in' keyword, like a python string.
378
379 e.g.
380
381 >>> from Bio.Seq import Seq
382 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
383 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
384 >>> "AAA" in my_dna
385 True
386 >>> Seq("AAA") in my_dna
387 True
388 >>> Seq("AAA", generic_dna) in my_dna
389 True
390
391 Like other Seq methods, this will raise a type error if another Seq
392 (or Seq like) object with an incompatible alphabet is used:
393
394 >>> Seq("AAA", generic_rna) in my_dna
395 Traceback (most recent call last):
396 ...
397 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
398 >>> Seq("AAA", generic_protein) in my_dna
399 Traceback (most recent call last):
400 ...
401 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
402 """
403
404 sub_str = self._get_seq_str_and_check_alphabet(char)
405 return sub_str in str(self)
406
408 """Find method, like that of a python string.
409
410 This behaves like the python string method of the same name.
411
412 Returns an integer, the index of the first occurrence of substring
413 argument sub in the (sub)sequence given by [start:end].
414
415 Arguments:
416 - sub - a string or another Seq object to look for
417 - start - optional integer, slice start
418 - end - optional integer, slice end
419
420 Returns -1 if the subsequence is NOT found.
421
422 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
423
424 >>> from Bio.Seq import Seq
425 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
426 >>> my_rna.find("AUG")
427 3
428 """
429
430 sub_str = self._get_seq_str_and_check_alphabet(sub)
431 return str(self).find(sub_str, start, end)
432
434 """Find from right method, like that of a python string.
435
436 This behaves like the python string method of the same name.
437
438 Returns an integer, the index of the last (right most) occurrence of
439 substring argument sub in the (sub)sequence given by [start:end].
440
441 Arguments:
442 - sub - a string or another Seq object to look for
443 - start - optional integer, slice start
444 - end - optional integer, slice end
445
446 Returns -1 if the subsequence is NOT found.
447
448 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
449
450 >>> from Bio.Seq import Seq
451 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
452 >>> my_rna.rfind("AUG")
453 15
454 """
455
456 sub_str = self._get_seq_str_and_check_alphabet(sub)
457 return str(self).rfind(sub_str, start, end)
458
460 """Does the Seq start with the given prefix? Returns True/False.
461
462 This behaves like the python string method of the same name.
463
464 Return True if the sequence starts with the specified prefix
465 (a string or another Seq object), False otherwise.
466 With optional start, test sequence beginning at that position.
467 With optional end, stop comparing sequence at that position.
468 prefix can also be a tuple of strings to try. e.g.
469
470 >>> from Bio.Seq import Seq
471 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
472 >>> my_rna.startswith("GUC")
473 True
474 >>> my_rna.startswith("AUG")
475 False
476 >>> my_rna.startswith("AUG", 3)
477 True
478 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
479 True
480 """
481
482 if isinstance(prefix, tuple):
483 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p)
484 for p in prefix)
485 return str(self).startswith(prefix_strs, start, end)
486 else:
487 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
488 return str(self).startswith(prefix_str, start, end)
489
491 """Does the Seq end with the given suffix? Returns True/False.
492
493 This behaves like the python string method of the same name.
494
495 Return True if the sequence ends with the specified suffix
496 (a string or another Seq object), False otherwise.
497 With optional start, test sequence beginning at that position.
498 With optional end, stop comparing sequence at that position.
499 suffix can also be a tuple of strings to try. e.g.
500
501 >>> from Bio.Seq import Seq
502 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
503 >>> my_rna.endswith("UUG")
504 True
505 >>> my_rna.endswith("AUG")
506 False
507 >>> my_rna.endswith("AUG", 0, 18)
508 True
509 >>> my_rna.endswith(("UCC","UCA","UUG"))
510 True
511 """
512
513 if isinstance(suffix, tuple):
514 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p)
515 for p in suffix)
516 return str(self).endswith(suffix_strs, start, end)
517 else:
518 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
519 return str(self).endswith(suffix_str, start, end)
520
521 - def split(self, sep=None, maxsplit=-1):
522 """Split method, like that of a python string.
523
524 This behaves like the python string method of the same name.
525
526 Return a list of the 'words' in the string (as Seq objects),
527 using sep as the delimiter string. If maxsplit is given, at
528 most maxsplit splits are done. If maxsplit is omitted, all
529 splits are made.
530
531 Following the python string method, sep will by default be any
532 white space (tabs, spaces, newlines) but this is unlikely to
533 apply to biological sequences.
534
535 e.g.
536
537 >>> from Bio.Seq import Seq
538 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
539 >>> my_aa = my_rna.translate()
540 >>> my_aa
541 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
542 >>> my_aa.split("*")
543 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
544 >>> my_aa.split("*",1)
545 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
546
547 See also the rsplit method:
548
549 >>> my_aa.rsplit("*",1)
550 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
551 """
552
553 sep_str = self._get_seq_str_and_check_alphabet(sep)
554
555
556 return [Seq(part, self.alphabet)
557 for part in str(self).split(sep_str, maxsplit)]
558
559 - def rsplit(self, sep=None, maxsplit=-1):
560 """Right split method, like that of a python string.
561
562 This behaves like the python string method of the same name.
563
564 Return a list of the 'words' in the string (as Seq objects),
565 using sep as the delimiter string. If maxsplit is given, at
566 most maxsplit splits are done COUNTING FROM THE RIGHT.
567 If maxsplit is omitted, all splits are made.
568
569 Following the python string method, sep will by default be any
570 white space (tabs, spaces, newlines) but this is unlikely to
571 apply to biological sequences.
572
573 e.g. print my_seq.rsplit("*",1)
574
575 See also the split method.
576 """
577
578 sep_str = self._get_seq_str_and_check_alphabet(sep)
579 return [Seq(part, self.alphabet)
580 for part in str(self).rsplit(sep_str, maxsplit)]
581
582 - def strip(self, chars=None):
583 """Returns a new Seq object with leading and trailing ends stripped.
584
585 This behaves like the python string method of the same name.
586
587 Optional argument chars defines which characters to remove. If
588 omitted or None (default) then as for the python string method,
589 this defaults to removing any white space.
590
591 e.g. print my_seq.strip("-")
592
593 See also the lstrip and rstrip methods.
594 """
595
596 strip_str = self._get_seq_str_and_check_alphabet(chars)
597 return Seq(str(self).strip(strip_str), self.alphabet)
598
599 - def lstrip(self, chars=None):
600 """Returns a new Seq object with leading (left) end stripped.
601
602 This behaves like the python string method of the same name.
603
604 Optional argument chars defines which characters to remove. If
605 omitted or None (default) then as for the python string method,
606 this defaults to removing any white space.
607
608 e.g. print my_seq.lstrip("-")
609
610 See also the strip and rstrip methods.
611 """
612
613 strip_str = self._get_seq_str_and_check_alphabet(chars)
614 return Seq(str(self).lstrip(strip_str), self.alphabet)
615
616 - def rstrip(self, chars=None):
617 """Returns a new Seq object with trailing (right) end stripped.
618
619 This behaves like the python string method of the same name.
620
621 Optional argument chars defines which characters to remove. If
622 omitted or None (default) then as for the python string method,
623 this defaults to removing any white space.
624
625 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
626
627 >>> from Bio.Alphabet import IUPAC
628 >>> from Bio.Seq import Seq
629 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
630 >>> my_seq
631 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
632 >>> my_seq.rstrip("A")
633 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
634
635 See also the strip and lstrip methods.
636 """
637
638 strip_str = self._get_seq_str_and_check_alphabet(chars)
639 return Seq(str(self).rstrip(strip_str), self.alphabet)
640
642 """Returns an upper case copy of the sequence.
643
644 >>> from Bio.Alphabet import HasStopCodon, generic_protein
645 >>> from Bio.Seq import Seq
646 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein))
647 >>> my_seq
648 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*'))
649 >>> my_seq.lower()
650 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*'))
651 >>> my_seq.upper()
652 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*'))
653
654 This will adjust the alphabet if required. See also the lower method.
655 """
656 return Seq(str(self).upper(), self.alphabet._upper())
657
659 """Returns a lower case copy of the sequence.
660
661 This will adjust the alphabet if required. Note that the IUPAC alphabets
662 are upper case only, and thus a generic alphabet must be substituted.
663
664 >>> from Bio.Alphabet import Gapped, generic_dna
665 >>> from Bio.Alphabet import IUPAC
666 >>> from Bio.Seq import Seq
667 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*"))
668 >>> my_seq
669 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*'))
670 >>> my_seq.lower()
671 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*'))
672
673 See also the upper method.
674 """
675 return Seq(str(self).lower(), self.alphabet._lower())
676
678 """Returns the complement sequence. New Seq object.
679
680 >>> from Bio.Seq import Seq
681 >>> from Bio.Alphabet import IUPAC
682 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
683 >>> my_dna
684 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
685 >>> my_dna.complement()
686 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
687
688 You can of course used mixed case sequences,
689
690 >>> from Bio.Seq import Seq
691 >>> from Bio.Alphabet import generic_dna
692 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
693 >>> my_dna
694 Seq('CCCCCgatA-GD', DNAAlphabet())
695 >>> my_dna.complement()
696 Seq('GGGGGctaT-CH', DNAAlphabet())
697
698 Note in the above example, ambiguous character D denotes
699 G, A or T so its complement is H (for C, T or A).
700
701 Trying to complement a protein sequence raises an exception.
702
703 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
704 >>> my_protein.complement()
705 Traceback (most recent call last):
706 ...
707 ValueError: Proteins do not have complements!
708 """
709 base = Alphabet._get_base_alphabet(self.alphabet)
710 if isinstance(base, Alphabet.ProteinAlphabet):
711 raise ValueError("Proteins do not have complements!")
712 if isinstance(base, Alphabet.DNAAlphabet):
713 ttable = _dna_complement_table
714 elif isinstance(base, Alphabet.RNAAlphabet):
715 ttable = _rna_complement_table
716 elif ('U' in self._data or 'u' in self._data) \
717 and ('T' in self._data or 't' in self._data):
718
719 raise ValueError("Mixed RNA/DNA found")
720 elif 'U' in self._data or 'u' in self._data:
721 ttable = _rna_complement_table
722 else:
723 ttable = _dna_complement_table
724
725
726 return Seq(str(self).translate(ttable), self.alphabet)
727
729 """Returns the reverse complement sequence. New Seq object.
730
731 >>> from Bio.Seq import Seq
732 >>> from Bio.Alphabet import IUPAC
733 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
734 >>> my_dna
735 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
736 >>> my_dna.reverse_complement()
737 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
738
739 Note in the above example, since R = G or A, its complement
740 is Y (which denotes C or T).
741
742 You can of course used mixed case sequences,
743
744 >>> from Bio.Seq import Seq
745 >>> from Bio.Alphabet import generic_dna
746 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
747 >>> my_dna
748 Seq('CCCCCgatA-G', DNAAlphabet())
749 >>> my_dna.reverse_complement()
750 Seq('C-TatcGGGGG', DNAAlphabet())
751
752 Trying to complement a protein sequence raises an exception:
753
754 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
755 >>> my_protein.reverse_complement()
756 Traceback (most recent call last):
757 ...
758 ValueError: Proteins do not have complements!
759 """
760
761 return self.complement()[::-1]
762
764 """Returns the RNA sequence from a DNA sequence. New Seq object.
765
766 >>> from Bio.Seq import Seq
767 >>> from Bio.Alphabet import IUPAC
768 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
769 ... IUPAC.unambiguous_dna)
770 >>> coding_dna
771 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
772 >>> coding_dna.transcribe()
773 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
774
775 Trying to transcribe a protein or RNA sequence raises an exception:
776
777 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
778 >>> my_protein.transcribe()
779 Traceback (most recent call last):
780 ...
781 ValueError: Proteins cannot be transcribed!
782 """
783 base = Alphabet._get_base_alphabet(self.alphabet)
784 if isinstance(base, Alphabet.ProteinAlphabet):
785 raise ValueError("Proteins cannot be transcribed!")
786 if isinstance(base, Alphabet.RNAAlphabet):
787 raise ValueError("RNA cannot be transcribed!")
788
789 if self.alphabet==IUPAC.unambiguous_dna:
790 alphabet = IUPAC.unambiguous_rna
791 elif self.alphabet==IUPAC.ambiguous_dna:
792 alphabet = IUPAC.ambiguous_rna
793 else:
794 alphabet = Alphabet.generic_rna
795 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
796
798 """Returns the DNA sequence from an RNA sequence. New Seq object.
799
800 >>> from Bio.Seq import Seq
801 >>> from Bio.Alphabet import IUPAC
802 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
803 ... IUPAC.unambiguous_rna)
804 >>> messenger_rna
805 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
806 >>> messenger_rna.back_transcribe()
807 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
808
809 Trying to back-transcribe a protein or DNA sequence raises an
810 exception:
811
812 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
813 >>> my_protein.back_transcribe()
814 Traceback (most recent call last):
815 ...
816 ValueError: Proteins cannot be back transcribed!
817 """
818 base = Alphabet._get_base_alphabet(self.alphabet)
819 if isinstance(base, Alphabet.ProteinAlphabet):
820 raise ValueError("Proteins cannot be back transcribed!")
821 if isinstance(base, Alphabet.DNAAlphabet):
822 raise ValueError("DNA cannot be back transcribed!")
823
824 if self.alphabet==IUPAC.unambiguous_rna:
825 alphabet = IUPAC.unambiguous_dna
826 elif self.alphabet==IUPAC.ambiguous_rna:
827 alphabet = IUPAC.ambiguous_dna
828 else:
829 alphabet = Alphabet.generic_dna
830 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
831
832 - def translate(self, table="Standard", stop_symbol="*", to_stop=False,
833 cds=False):
834 """Turns a nucleotide sequence into a protein sequence. New Seq object.
835
836 This method will translate DNA or RNA sequences, and those with a
837 nucleotide or generic alphabet. Trying to translate a protein
838 sequence raises an exception.
839
840 Arguments:
841 - table - Which codon table to use? This can be either a name
842 (string), an NCBI identifier (integer), or a CodonTable
843 object (useful for non-standard genetic codes). This
844 defaults to the "Standard" table.
845 - stop_symbol - Single character string, what to use for terminators.
846 This defaults to the asterisk, "*".
847 - to_stop - Boolean, defaults to False meaning do a full translation
848 continuing on past any stop codons (translated as the
849 specified stop_symbol). If True, translation is
850 terminated at the first in frame stop codon (and the
851 stop_symbol is not appended to the returned protein
852 sequence).
853 - cds - Boolean, indicates this is a complete CDS. If True,
854 this checks the sequence starts with a valid alternative start
855 codon (which will be translated as methionine, M), that the
856 sequence length is a multiple of three, and that there is a
857 single in frame stop codon at the end (this will be excluded
858 from the protein sequence, regardless of the to_stop option).
859 If these tests fail, an exception is raised.
860
861 e.g. Using the standard table:
862
863 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
864 >>> coding_dna.translate()
865 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
866 >>> coding_dna.translate(stop_symbol="@")
867 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
868 >>> coding_dna.translate(to_stop=True)
869 Seq('VAIVMGR', ExtendedIUPACProtein())
870
871 Now using NCBI table 2, where TGA is not a stop codon:
872
873 >>> coding_dna.translate(table=2)
874 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
875 >>> coding_dna.translate(table=2, to_stop=True)
876 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
877
878 In fact, GTG is an alternative start codon under NCBI table 2, meaning
879 this sequence could be a complete CDS:
880
881 >>> coding_dna.translate(table=2, cds=True)
882 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())
883
884 It isn't a valid CDS under NCBI table 1, due to both the start codon and
885 also the in frame stop codons:
886
887 >>> coding_dna.translate(table=1, cds=True)
888 Traceback (most recent call last):
889 ...
890 TranslationError: First codon 'GTG' is not a start codon
891
892 If the sequence has no in-frame stop codon, then the to_stop argument
893 has no effect:
894
895 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
896 >>> coding_dna2.translate()
897 Seq('LAIVMGR', ExtendedIUPACProtein())
898 >>> coding_dna2.translate(to_stop=True)
899 Seq('LAIVMGR', ExtendedIUPACProtein())
900
901 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
902 or a stop codon. These are translated as "X". Any invalid codon
903 (e.g. "TA?" or "T-A") will throw a TranslationError.
904
905 NOTE - Does NOT support gapped sequences.
906
907 NOTE - This does NOT behave like the python string's translate
908 method. For that use str(my_seq).translate(...) instead.
909 """
910 if isinstance(table, str) and len(table)==256:
911 raise ValueError("The Seq object translate method DOES NOT take "
912 + "a 256 character string mapping table like "
913 + "the python string object's translate method. "
914 + "Use str(my_seq).translate(...) instead.")
915 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
916 Alphabet.ProteinAlphabet):
917 raise ValueError("Proteins cannot be translated!")
918 try:
919 table_id = int(table)
920 except ValueError:
921
922 if self.alphabet==IUPAC.unambiguous_dna:
923
924 codon_table = CodonTable.unambiguous_dna_by_name[table]
925 elif self.alphabet==IUPAC.unambiguous_rna:
926
927 codon_table = CodonTable.unambiguous_rna_by_name[table]
928 else:
929
930
931
932 codon_table = CodonTable.ambiguous_generic_by_name[table]
933 except (AttributeError, TypeError):
934
935 if isinstance(table, CodonTable.CodonTable):
936 codon_table = table
937 else:
938 raise ValueError('Bad table argument')
939 else:
940
941 if self.alphabet==IUPAC.unambiguous_dna:
942
943 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
944 elif self.alphabet==IUPAC.unambiguous_rna:
945
946 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
947 else:
948
949
950
951 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
952 protein = _translate_str(str(self), codon_table,
953 stop_symbol, to_stop, cds)
954 if stop_symbol in protein:
955 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
956 stop_symbol = stop_symbol)
957 else:
958 alphabet = codon_table.protein_alphabet
959 return Seq(protein, alphabet)
960
961 - def ungap(self, gap=None):
962 """Return a copy of the sequence without the gap character(s).
963
964 The gap character can be specified in two ways - either as an explicit
965 argument, or via the sequence's alphabet. For example:
966
967 >>> from Bio.Seq import Seq
968 >>> from Bio.Alphabet import generic_dna
969 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna)
970 >>> my_dna
971 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet())
972 >>> my_dna.ungap("-")
973 Seq('ATATGAAATTTGAAAA', DNAAlphabet())
974
975 If the gap character is not given as an argument, it will be taken from
976 the sequence's alphabet (if defined). Notice that the returned sequence's
977 alphabet is adjusted since it no longer requires a gapped alphabet:
978
979 >>> from Bio.Seq import Seq
980 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon
981 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "=")))
982 >>> my_pro
983 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*'))
984 >>> my_pro.ungap()
985 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*'))
986
987 Or, with a simpler gapped DNA example:
988
989 >>> from Bio.Seq import Seq
990 >>> from Bio.Alphabet import IUPAC, Gapped
991 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "="))
992 >>> my_seq
993 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
994 >>> my_seq.ungap()
995 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
996
997 As long as it is consistent with the alphabet, although it is redundant,
998 you can still supply the gap character as an argument to this method:
999
1000 >>> my_seq
1001 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1002 >>> my_seq.ungap("=")
1003 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
1004
1005 However, if the gap character given as the argument disagrees with that
1006 declared in the alphabet, an exception is raised:
1007
1008 >>> my_seq
1009 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1010 >>> my_seq.ungap("-")
1011 Traceback (most recent call last):
1012 ...
1013 ValueError: Gap '-' does not match '=' from alphabet
1014
1015 Finally, if a gap character is not supplied, and the alphabet does not
1016 define one, an exception is raised:
1017
1018 >>> from Bio.Seq import Seq
1019 >>> from Bio.Alphabet import generic_dna
1020 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna)
1021 >>> my_dna
1022 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet())
1023 >>> my_dna.ungap()
1024 Traceback (most recent call last):
1025 ...
1026 ValueError: Gap character not given and not defined in alphabet
1027
1028 """
1029 if hasattr(self.alphabet, "gap_char"):
1030 if not gap:
1031 gap = self.alphabet.gap_char
1032 elif gap != self.alphabet.gap_char:
1033 raise ValueError("Gap %s does not match %s from alphabet"
1034 % (repr(gap), repr(self.alphabet.gap_char)))
1035 alpha = Alphabet._ungap(self.alphabet)
1036 elif not gap:
1037 raise ValueError("Gap character not given and not defined in alphabet")
1038 else:
1039 alpha = self.alphabet
1040 if len(gap)!=1 or not isinstance(gap, str):
1041 raise ValueError("Unexpected gap character, %s" % repr(gap))
1042 return Seq(str(self).replace(gap, ""), alpha)
1043
1044
1046 """A read-only sequence object of known length but unknown contents.
1047
1048 If you have an unknown sequence, you can represent this with a normal
1049 Seq object, for example:
1050
1051 >>> my_seq = Seq("N"*5)
1052 >>> my_seq
1053 Seq('NNNNN', Alphabet())
1054 >>> len(my_seq)
1055 5
1056 >>> print my_seq
1057 NNNNN
1058
1059 However, this is rather wasteful of memory (especially for large
1060 sequences), which is where this class is most usefull:
1061
1062 >>> unk_five = UnknownSeq(5)
1063 >>> unk_five
1064 UnknownSeq(5, alphabet = Alphabet(), character = '?')
1065 >>> len(unk_five)
1066 5
1067 >>> print(unk_five)
1068 ?????
1069
1070 You can add unknown sequence together, provided their alphabets and
1071 characters are compatible, and get another memory saving UnknownSeq:
1072
1073 >>> unk_four = UnknownSeq(4)
1074 >>> unk_four
1075 UnknownSeq(4, alphabet = Alphabet(), character = '?')
1076 >>> unk_four + unk_five
1077 UnknownSeq(9, alphabet = Alphabet(), character = '?')
1078
1079 If the alphabet or characters don't match up, the addition gives an
1080 ordinary Seq object:
1081
1082 >>> unk_nnnn = UnknownSeq(4, character = "N")
1083 >>> unk_nnnn
1084 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
1085 >>> unk_nnnn + unk_four
1086 Seq('NNNN????', Alphabet())
1087
1088 Combining with a real Seq gives a new Seq object:
1089
1090 >>> known_seq = Seq("ACGT")
1091 >>> unk_four + known_seq
1092 Seq('????ACGT', Alphabet())
1093 >>> known_seq + unk_four
1094 Seq('ACGT????', Alphabet())
1095 """
1097 """Create a new UnknownSeq object.
1098
1099 If character is omitted, it is determined from the alphabet, "N" for
1100 nucleotides, "X" for proteins, and "?" otherwise.
1101 """
1102 self._length = int(length)
1103 if self._length < 0:
1104
1105 raise ValueError("Length must not be negative.")
1106 self.alphabet = alphabet
1107 if character:
1108 if len(character) != 1:
1109 raise ValueError("character argument should be a single letter string.")
1110 self._character = character
1111 else:
1112 base = Alphabet._get_base_alphabet(alphabet)
1113
1114
1115 if isinstance(base, Alphabet.NucleotideAlphabet):
1116 self._character = "N"
1117 elif isinstance(base, Alphabet.ProteinAlphabet):
1118 self._character = "X"
1119 else:
1120 self._character = "?"
1121
1123 """Returns the stated length of the unknown sequence."""
1124 return self._length
1125
1127 """Returns the unknown sequence as full string of the given length."""
1128 return self._character * self._length
1129
1131 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
1132 % (self._length, repr(self.alphabet), repr(self._character))
1133
1135 """Add another sequence or string to this sequence.
1136
1137 Adding two UnknownSeq objects returns another UnknownSeq object
1138 provided the character is the same and the alphabets are compatible.
1139
1140 >>> from Bio.Seq import UnknownSeq
1141 >>> from Bio.Alphabet import generic_protein
1142 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein)
1143 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X')
1144
1145 If the characters differ, an UnknownSeq object cannot be used, so a
1146 Seq object is returned:
1147
1148 >>> from Bio.Seq import UnknownSeq
1149 >>> from Bio.Alphabet import generic_protein
1150 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein,
1151 ... character="x")
1152 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet())
1153
1154 If adding a string to an UnknownSeq, a new Seq is returned with the
1155 same alphabet:
1156
1157 >>> from Bio.Seq import UnknownSeq
1158 >>> from Bio.Alphabet import generic_protein
1159 >>> UnknownSeq(5, generic_protein) + "LV"
1160 Seq('XXXXXLV', ProteinAlphabet())
1161 """
1162 if isinstance(other, UnknownSeq) \
1163 and other._character == self._character:
1164
1165 return UnknownSeq(len(self)+len(other),
1166 self.alphabet, self._character)
1167
1168 return Seq(str(self), self.alphabet) + other
1169
1174
1176 """Get a subsequence from the UnknownSeq object.
1177
1178 >>> unk = UnknownSeq(8, character="N")
1179 >>> print unk[:]
1180 NNNNNNNN
1181 >>> print unk[5:3]
1182 <BLANKLINE>
1183 >>> print unk[1:-1]
1184 NNNNNN
1185 >>> print unk[1:-1:2]
1186 NNN
1187 """
1188 if isinstance(index, int):
1189
1190 return str(self)[index]
1191 old_length = self._length
1192 step = index.step
1193 if step is None or step == 1:
1194
1195 start = index.start
1196 end = index.stop
1197 if start is None:
1198 start = 0
1199 elif start < 0:
1200 start = max(0, old_length + start)
1201 elif start > old_length:
1202 start = old_length
1203 if end is None:
1204 end = old_length
1205 elif end < 0:
1206 end = max(0, old_length + end)
1207 elif end > old_length:
1208 end = old_length
1209 new_length = max(0, end-start)
1210 elif step == 0:
1211 raise ValueError("slice step cannot be zero")
1212 else:
1213
1214 new_length = len(("X"*old_length)[index])
1215
1216
1217
1218 return UnknownSeq(new_length, self.alphabet, self._character)
1219
1221 """Non-overlapping count method, like that of a python string.
1222
1223 This behaves like the python string (and Seq object) method of the
1224 same name, which does a non-overlapping count!
1225
1226 Returns an integer, the number of occurrences of substring
1227 argument sub in the (sub)sequence given by [start:end].
1228 Optional arguments start and end are interpreted as in slice
1229 notation.
1230
1231 Arguments:
1232 - sub - a string or another Seq object to look for
1233 - start - optional integer, slice start
1234 - end - optional integer, slice end
1235
1236 >>> "NNNN".count("N")
1237 4
1238 >>> Seq("NNNN").count("N")
1239 4
1240 >>> UnknownSeq(4, character="N").count("N")
1241 4
1242 >>> UnknownSeq(4, character="N").count("A")
1243 0
1244 >>> UnknownSeq(4, character="N").count("AA")
1245 0
1246
1247 HOWEVER, please note because that python strings and Seq objects (and
1248 MutableSeq objects) do a non-overlapping search, this may not give
1249 the answer you expect:
1250
1251 >>> UnknownSeq(4, character="N").count("NN")
1252 2
1253 >>> UnknownSeq(4, character="N").count("NNN")
1254 1
1255 """
1256 sub_str = self._get_seq_str_and_check_alphabet(sub)
1257 if len(sub_str) == 1:
1258 if str(sub_str) == self._character:
1259 if start==0 and end >= self._length:
1260 return self._length
1261 else:
1262
1263 return str(self).count(sub_str, start, end)
1264 else:
1265 return 0
1266 else:
1267 if set(sub_str) == set(self._character):
1268 if start==0 and end >= self._length:
1269 return self._length // len(sub_str)
1270 else:
1271
1272 return str(self).count(sub_str, start, end)
1273 else:
1274 return 0
1275
1277 """The complement of an unknown nucleotide equals itself.
1278
1279 >>> my_nuc = UnknownSeq(8)
1280 >>> my_nuc
1281 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1282 >>> print my_nuc
1283 ????????
1284 >>> my_nuc.complement()
1285 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1286 >>> print my_nuc.complement()
1287 ????????
1288 """
1289 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1290 Alphabet.ProteinAlphabet):
1291 raise ValueError("Proteins do not have complements!")
1292 return self
1293
1295 """The reverse complement of an unknown nucleotide equals itself.
1296
1297 >>> my_nuc = UnknownSeq(10)
1298 >>> my_nuc
1299 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1300 >>> print my_nuc
1301 ??????????
1302 >>> my_nuc.reverse_complement()
1303 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1304 >>> print my_nuc.reverse_complement()
1305 ??????????
1306 """
1307 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1308 Alphabet.ProteinAlphabet):
1309 raise ValueError("Proteins do not have complements!")
1310 return self
1311
1313 """Returns unknown RNA sequence from an unknown DNA sequence.
1314
1315 >>> my_dna = UnknownSeq(10, character="N")
1316 >>> my_dna
1317 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1318 >>> print my_dna
1319 NNNNNNNNNN
1320 >>> my_rna = my_dna.transcribe()
1321 >>> my_rna
1322 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1323 >>> print my_rna
1324 NNNNNNNNNN
1325 """
1326
1327 s = Seq(self._character, self.alphabet).transcribe()
1328 return UnknownSeq(self._length, s.alphabet, self._character)
1329
1331 """Returns unknown DNA sequence from an unknown RNA sequence.
1332
1333 >>> my_rna = UnknownSeq(20, character="N")
1334 >>> my_rna
1335 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1336 >>> print my_rna
1337 NNNNNNNNNNNNNNNNNNNN
1338 >>> my_dna = my_rna.back_transcribe()
1339 >>> my_dna
1340 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1341 >>> print my_dna
1342 NNNNNNNNNNNNNNNNNNNN
1343 """
1344
1345 s = Seq(self._character, self.alphabet).back_transcribe()
1346 return UnknownSeq(self._length, s.alphabet, self._character)
1347
1349 """Returns an upper case copy of the sequence.
1350
1351 >>> from Bio.Alphabet import generic_dna
1352 >>> from Bio.Seq import UnknownSeq
1353 >>> my_seq = UnknownSeq(20, generic_dna, character="n")
1354 >>> my_seq
1355 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n')
1356 >>> print my_seq
1357 nnnnnnnnnnnnnnnnnnnn
1358 >>> my_seq.upper()
1359 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1360 >>> print my_seq.upper()
1361 NNNNNNNNNNNNNNNNNNNN
1362
1363 This will adjust the alphabet if required. See also the lower method.
1364 """
1365 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1366
1368 """Returns a lower case copy of the sequence.
1369
1370 This will adjust the alphabet if required:
1371
1372 >>> from Bio.Alphabet import IUPAC
1373 >>> from Bio.Seq import UnknownSeq
1374 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein)
1375 >>> my_seq
1376 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X')
1377 >>> print my_seq
1378 XXXXXXXXXXXXXXXXXXXX
1379 >>> my_seq.lower()
1380 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x')
1381 >>> print my_seq.lower()
1382 xxxxxxxxxxxxxxxxxxxx
1383
1384 See also the upper method.
1385 """
1386 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1387
1389 """Translate an unknown nucleotide sequence into an unknown protein.
1390
1391 e.g.
1392
1393 >>> my_seq = UnknownSeq(11, character="N")
1394 >>> print my_seq
1395 NNNNNNNNNNN
1396 >>> my_protein = my_seq.translate()
1397 >>> my_protein
1398 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1399 >>> print my_protein
1400 XXX
1401
1402 In comparison, using a normal Seq object:
1403
1404 >>> my_seq = Seq("NNNNNNNNNNN")
1405 >>> print my_seq
1406 NNNNNNNNNNN
1407 >>> my_protein = my_seq.translate()
1408 >>> my_protein
1409 Seq('XXX', ExtendedIUPACProtein())
1410 >>> print my_protein
1411 XXX
1412
1413 """
1414 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1415 Alphabet.ProteinAlphabet):
1416 raise ValueError("Proteins cannot be translated!")
1417 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1418
1419 - def ungap(self, gap=None):
1420 """Return a copy of the sequence without the gap character(s).
1421
1422 The gap character can be specified in two ways - either as an explicit
1423 argument, or via the sequence's alphabet. For example:
1424
1425 >>> from Bio.Seq import UnknownSeq
1426 >>> from Bio.Alphabet import Gapped, generic_dna
1427 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-"))
1428 >>> my_dna
1429 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N')
1430 >>> my_dna.ungap()
1431 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1432 >>> my_dna.ungap("-")
1433 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1434
1435 If the UnknownSeq is using the gap character, then an empty Seq is
1436 returned:
1437
1438 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-")
1439 >>> my_gap
1440 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-')
1441 >>> my_gap.ungap()
1442 Seq('', DNAAlphabet())
1443 >>> my_gap.ungap("-")
1444 Seq('', DNAAlphabet())
1445
1446 Notice that the returned sequence's alphabet is adjusted to remove any
1447 explicit gap character declaration.
1448 """
1449
1450 s = Seq(self._character, self.alphabet).ungap()
1451 if s:
1452 return UnknownSeq(self._length, s.alphabet, self._character)
1453 else:
1454 return Seq("", s.alphabet)
1455
1456
1458 """An editable sequence object (with an alphabet).
1459
1460 Unlike normal python strings and our basic sequence object (the Seq class)
1461 which are immuatable, the MutableSeq lets you edit the sequence in place.
1462 However, this means you cannot use a MutableSeq object as a dictionary key.
1463
1464 >>> from Bio.Seq import MutableSeq
1465 >>> from Bio.Alphabet import generic_dna
1466 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1467 >>> my_seq
1468 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1469 >>> my_seq[5]
1470 'T'
1471 >>> my_seq[5] = "A"
1472 >>> my_seq
1473 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1474 >>> my_seq[5]
1475 'A'
1476 >>> my_seq[5:8] = "NNN"
1477 >>> my_seq
1478 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1479 >>> len(my_seq)
1480 11
1481
1482 Note that the MutableSeq object does not support as many string-like
1483 or biological methods as the Seq object.
1484 """
1486 if sys.version_info[0] == 3:
1487 self.array_indicator = "u"
1488 else:
1489 self.array_indicator = "c"
1490 if isinstance(data, str):
1491 self.data = array.array(self.array_indicator, data)
1492 else:
1493 self.data = data
1494 self.alphabet = alphabet
1495
1497 """Returns a (truncated) representation of the sequence for debugging."""
1498 if len(self) > 60:
1499
1500
1501
1502 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1503 str(self[:54]), str(self[-3:]),
1504 repr(self.alphabet))
1505 else:
1506 return "%s('%s', %s)" % (self.__class__.__name__,
1507 str(self),
1508 repr(self.alphabet))
1509
1511 """Returns the full sequence as a python string.
1512
1513 Note that Biopython 1.44 and earlier would give a truncated
1514 version of repr(my_seq) for str(my_seq). If you are writing code
1515 which needs to be backwards compatible with old Biopython, you
1516 should continue to use my_seq.tostring() rather than str(my_seq).
1517 """
1518
1519 return "".join(self.data)
1520
1522 """Compare the sequence to another sequence or a string (README).
1523
1524 Currently if compared to another sequence the alphabets must be
1525 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise
1526 an exception. Otherwise only the sequence itself is compared, not the
1527 precise alphabet.
1528
1529 A future release of Biopython will change this (and the Seq object etc)
1530 to use simple string comparison. The plan is that comparing sequences
1531 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning
1532 but not an exception.
1533
1534 During this transition period, please just do explicit comparisons:
1535
1536 >>> seq1 = MutableSeq("ACGT")
1537 >>> seq2 = MutableSeq("ACGT")
1538 >>> id(seq1) == id(seq2)
1539 False
1540 >>> str(seq1) == str(seq2)
1541 True
1542
1543 This method indirectly supports ==, < , etc.
1544 """
1545 if hasattr(other, "alphabet"):
1546
1547 import warnings
1548 warnings.warn("In future comparing incompatible alphabets will "
1549 "only trigger a warning (not an exception). In "
1550 "the interim please use id(seq1)==id(seq2) or "
1551 "str(seq1)==str(seq2) to make your code explicit "
1552 "and to avoid this warning.", FutureWarning)
1553 if not Alphabet._check_type_compatible([self.alphabet,
1554 other.alphabet]):
1555 raise TypeError("Incompatable alphabets %s and %s"
1556 % (repr(self.alphabet), repr(other.alphabet)))
1557
1558 if isinstance(other, MutableSeq):
1559
1560
1561 return cmp(self.data, other.data)
1562 else:
1563 return cmp(str(self), str(other))
1564 elif isinstance(other, basestring):
1565 return cmp(str(self), other)
1566 else:
1567 raise TypeError
1568
1570 return len(self.data)
1571
1582
1599
1601
1602
1603
1604
1605
1606 del self.data[index]
1607
1631
1652
1655
1658
1659 - def pop(self, i = (-1)):
1660 c = self.data[i]
1661 del self.data[i]
1662 return c
1663
1665 for i in range(len(self.data)):
1666 if self.data[i] == item:
1667 del self.data[i]
1668 return
1669 raise ValueError("MutableSeq.remove(x): x not in list")
1670
1672 """Non-overlapping count method, like that of a python string.
1673
1674 This behaves like the python string method of the same name,
1675 which does a non-overlapping count!
1676
1677 Returns an integer, the number of occurrences of substring
1678 argument sub in the (sub)sequence given by [start:end].
1679 Optional arguments start and end are interpreted as in slice
1680 notation.
1681
1682 Arguments:
1683 - sub - a string or another Seq object to look for
1684 - start - optional integer, slice start
1685 - end - optional integer, slice end
1686
1687 e.g.
1688
1689 >>> from Bio.Seq import MutableSeq
1690 >>> my_mseq = MutableSeq("AAAATGA")
1691 >>> print my_mseq.count("A")
1692 5
1693 >>> print my_mseq.count("ATG")
1694 1
1695 >>> print my_mseq.count(Seq("AT"))
1696 1
1697 >>> print my_mseq.count("AT", 2, -1)
1698 1
1699
1700 HOWEVER, please note because that python strings, Seq objects and
1701 MutableSeq objects do a non-overlapping search, this may not give
1702 the answer you expect:
1703
1704 >>> "AAAA".count("AA")
1705 2
1706 >>> print MutableSeq("AAAA").count("AA")
1707 2
1708
1709 A non-overlapping search would give the answer as three!
1710 """
1711 try:
1712
1713 search = sub.tostring()
1714 except AttributeError:
1715 search = sub
1716
1717 if not isinstance(search, basestring):
1718 raise TypeError("expected a string, Seq or MutableSeq")
1719
1720 if len(search) == 1:
1721
1722 count = 0
1723 for c in self.data[start:end]:
1724 if c == search:
1725 count += 1
1726 return count
1727 else:
1728
1729 return self.tostring().count(search, start, end)
1730
1732 for i in range(len(self.data)):
1733 if self.data[i] == item:
1734 return i
1735 raise ValueError("MutableSeq.index(x): x not in list")
1736
1738 """Modify the mutable sequence to reverse itself.
1739
1740 No return value.
1741 """
1742 self.data.reverse()
1743
1769
1771 """Modify the mutable sequence to take on its reverse complement.
1772
1773 Trying to reverse complement a protein sequence raises an exception.
1774
1775 No return value.
1776 """
1777 self.complement()
1778 self.data.reverse()
1779
1780
1781
1782
1790
1792 """Returns the full sequence as a python string (semi-obsolete).
1793
1794 Although not formally deprecated, you are now encouraged to use
1795 str(my_seq) instead of my_seq.tostring().
1796
1797 Because str(my_seq) will give you the full sequence as a python string,
1798 there is often no need to make an explicit conversion. For example,
1799
1800 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1801
1802 On Biopython 1.44 or older you would have to have done this:
1803
1804 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1805 """
1806 return "".join(self.data)
1807
1809 """Returns the full sequence as a new immutable Seq object.
1810
1811 >>> from Bio.Seq import Seq
1812 >>> from Bio.Alphabet import IUPAC
1813 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL",
1814 ... IUPAC.protein)
1815 >>> my_mseq
1816 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1817 >>> my_mseq.toseq()
1818 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1819
1820 Note that the alphabet is preserved.
1821 """
1822 return Seq("".join(self.data), self.alphabet)
1823
1824
1825
1826
1827
1828
1830 """Transcribes a DNA sequence into RNA.
1831
1832 If given a string, returns a new string object.
1833
1834 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1835
1836 Trying to transcribe a protein or RNA sequence raises an exception.
1837
1838 e.g.
1839
1840 >>> transcribe("ACTGN")
1841 'ACUGN'
1842 """
1843 if isinstance(dna, Seq):
1844 return dna.transcribe()
1845 elif isinstance(dna, MutableSeq):
1846 return dna.toseq().transcribe()
1847 else:
1848 return dna.replace('T', 'U').replace('t', 'u')
1849
1850
1852 """Back-transcribes an RNA sequence into DNA.
1853
1854 If given a string, returns a new string object.
1855
1856 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1857
1858 Trying to transcribe a protein or DNA sequence raises an exception.
1859
1860 e.g.
1861
1862 >>> back_transcribe("ACUGN")
1863 'ACTGN'
1864 """
1865 if isinstance(rna, Seq):
1866 return rna.back_transcribe()
1867 elif isinstance(rna, MutableSeq):
1868 return rna.toseq().back_transcribe()
1869 else:
1870 return rna.replace('U', 'T').replace('u', 't')
1871
1872
1873 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
1874 cds=False, pos_stop="X"):
1875 """Helper function to translate a nucleotide string (PRIVATE).
1876
1877 Arguments:
1878 - sequence - a string
1879 - table - a CodonTable object (NOT a table name or id number)
1880 - stop_symbol - a single character string, what to use for terminators.
1881 - to_stop - boolean, should translation terminate at the first
1882 in frame stop codon? If there is no in-frame stop codon
1883 then translation continues to the end.
1884 - pos_stop - a single character string for a possible stop codon
1885 (e.g. TAN or NNN)
1886 - cds - Boolean, indicates this is a complete CDS. If True, this
1887 checks the sequence starts with a valid alternative start
1888 codon (which will be translated as methionine, M), that the
1889 sequence length is a multiple of three, and that there is a
1890 single in frame stop codon at the end (this will be excluded
1891 from the protein sequence, regardless of the to_stop option).
1892 If these tests fail, an exception is raised.
1893
1894 Returns a string.
1895
1896 e.g.
1897
1898 >>> from Bio.Data import CodonTable
1899 >>> table = CodonTable.ambiguous_dna_by_id[1]
1900 >>> _translate_str("AAA", table)
1901 'K'
1902 >>> _translate_str("TAR", table)
1903 '*'
1904 >>> _translate_str("TAN", table)
1905 'X'
1906 >>> _translate_str("TAN", table, pos_stop="@")
1907 '@'
1908 >>> _translate_str("TA?", table)
1909 Traceback (most recent call last):
1910 ...
1911 TranslationError: Codon 'TA?' is invalid
1912 >>> _translate_str("ATGCCCTAG", table, cds=True)
1913 'MP'
1914 >>> _translate_str("AAACCCTAG", table, cds=True)
1915 Traceback (most recent call last):
1916 ...
1917 TranslationError: First codon 'AAA' is not a start codon
1918 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
1919 Traceback (most recent call last):
1920 ...
1921 TranslationError: Extra in frame stop codon found.
1922 """
1923 sequence = sequence.upper()
1924 amino_acids = []
1925 forward_table = table.forward_table
1926 stop_codons = table.stop_codons
1927 if table.nucleotide_alphabet.letters is not None:
1928 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1929 else:
1930
1931 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() +
1932 IUPAC.ambiguous_rna.letters.upper())
1933 if cds:
1934 if str(sequence[:3]).upper() not in table.start_codons:
1935 raise CodonTable.TranslationError(
1936 "First codon '%s' is not a start codon" % sequence[:3])
1937 if len(sequence) % 3 != 0:
1938 raise CodonTable.TranslationError(
1939 "Sequence length %i is not a multiple of three" % len(sequence))
1940 if str(sequence[-3:]).upper() not in stop_codons:
1941 raise CodonTable.TranslationError(
1942 "Final codon '%s' is not a stop codon" % sequence[-3:])
1943
1944 sequence = sequence[3:-3]
1945 amino_acids = ["M"]
1946 n = len(sequence)
1947 for i in xrange(0, n-n%3, 3):
1948 codon = sequence[i:i+3]
1949 try:
1950 amino_acids.append(forward_table[codon])
1951 except (KeyError, CodonTable.TranslationError):
1952
1953 if codon in table.stop_codons:
1954 if cds:
1955 raise CodonTable.TranslationError(
1956 "Extra in frame stop codon found.")
1957 if to_stop:
1958 break
1959 amino_acids.append(stop_symbol)
1960 elif valid_letters.issuperset(set(codon)):
1961
1962 amino_acids.append(pos_stop)
1963 else:
1964 raise CodonTable.TranslationError(
1965 "Codon '%s' is invalid" % codon)
1966 return "".join(amino_acids)
1967
1968
1969 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
1970 cds=False):
1971 """Translate a nucleotide sequence into amino acids.
1972
1973 If given a string, returns a new string object. Given a Seq or
1974 MutableSeq, returns a Seq object with a protein alphabet.
1975
1976 Arguments:
1977 - table - Which codon table to use? This can be either a name (string),
1978 an NCBI identifier (integer), or a CodonTable object (useful
1979 for non-standard genetic codes). Defaults to the "Standard"
1980 table.
1981 - stop_symbol - Single character string, what to use for any
1982 terminators, defaults to the asterisk, "*".
1983 - to_stop - Boolean, defaults to False meaning do a full
1984 translation continuing on past any stop codons
1985 (translated as the specified stop_symbol). If
1986 True, translation is terminated at the first in
1987 frame stop codon (and the stop_symbol is not
1988 appended to the returned protein sequence).
1989 - cds - Boolean, indicates this is a complete CDS. If True, this
1990 checks the sequence starts with a valid alternative start
1991 codon (which will be translated as methionine, M), that the
1992 sequence length is a multiple of three, and that there is a
1993 single in frame stop codon at the end (this will be excluded
1994 from the protein sequence, regardless of the to_stop option).
1995 If these tests fail, an exception is raised.
1996
1997 A simple string example using the default (standard) genetic code:
1998
1999 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
2000 >>> translate(coding_dna)
2001 'VAIVMGR*KGAR*'
2002 >>> translate(coding_dna, stop_symbol="@")
2003 'VAIVMGR@KGAR@'
2004 >>> translate(coding_dna, to_stop=True)
2005 'VAIVMGR'
2006
2007 Now using NCBI table 2, where TGA is not a stop codon:
2008
2009 >>> translate(coding_dna, table=2)
2010 'VAIVMGRWKGAR*'
2011 >>> translate(coding_dna, table=2, to_stop=True)
2012 'VAIVMGRWKGAR'
2013
2014 In fact this example uses an alternative start codon valid under NCBI table 2,
2015 GTG, which means this example is a complete valid CDS which when translated
2016 should really start with methionine (not valine):
2017
2018 >>> translate(coding_dna, table=2, cds=True)
2019 'MAIVMGRWKGAR'
2020
2021 Note that if the sequence has no in-frame stop codon, then the to_stop
2022 argument has no effect:
2023
2024 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
2025 >>> translate(coding_dna2)
2026 'VAIVMGR'
2027 >>> translate(coding_dna2, to_stop=True)
2028 'VAIVMGR'
2029
2030 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
2031 or a stop codon. These are translated as "X". Any invalid codon
2032 (e.g. "TA?" or "T-A") will throw a TranslationError.
2033
2034 NOTE - Does NOT support gapped sequences.
2035
2036 It will however translate either DNA or RNA.
2037 """
2038 if isinstance(sequence, Seq):
2039 return sequence.translate(table, stop_symbol, to_stop, cds)
2040 elif isinstance(sequence, MutableSeq):
2041
2042 return sequence.toseq().translate(table, stop_symbol, to_stop, cds)
2043 else:
2044
2045 try:
2046 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
2047 except ValueError:
2048 codon_table = CodonTable.ambiguous_generic_by_name[table]
2049 except (AttributeError, TypeError):
2050 if isinstance(table, CodonTable.CodonTable):
2051 codon_table = table
2052 else:
2053 raise ValueError('Bad table argument')
2054 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2055
2056
2090
2091
2093 """Run the Bio.Seq module's doctests (PRIVATE)."""
2094 if sys.version_info[0:2] == (3, 1):
2095 print "Not running Bio.Seq doctest on Python 3.1"
2096 print "See http://bugs.python.org/issue7490"
2097 else:
2098 print "Running doctests..."
2099 import doctest
2100 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
2101 print "Done"
2102
2103 if __name__ == "__main__":
2104 _test()
2105