Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000 Andrew Dalke. 
   2  # Copyright 2000-2002 Brad Chapman. 
   3  # Copyright 2004-2005, 2010 by M de Hoon. 
   4  # Copyright 2007-2018 by Peter Cock. 
   5  # All rights reserved. 
   6  # 
   7  # This file is part of the Biopython distribution and governed by your 
   8  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
   9  # Please see the LICENSE file that should have been included as part of this 
  10  # package. 
  11  """Provide objects to represent biological sequences with alphabets. 
  12   
  13  See also the Seq_ wiki and the chapter in our tutorial: 
  14   - `HTML Tutorial`_ 
  15   - `PDF Tutorial`_ 
  16   
  17  .. _Seq: http://biopython.org/wiki/Seq 
  18  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  19  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  20   
  21  """ 
  22  from __future__ import print_function 
  23   
  24  import string  # for maketrans only 
  25  import array 
  26  import sys 
  27  import warnings 
  28   
  29  from Bio._py3k import range 
  30  from Bio._py3k import basestring 
  31   
  32  from Bio import BiopythonWarning 
  33  from Bio import Alphabet 
  34  from Bio.Alphabet import IUPAC 
  35  from Bio.Data.IUPACData import (ambiguous_dna_complement, 
  36                                  ambiguous_rna_complement) 
  37  from Bio.Data import CodonTable 
  38   
  39   
40 -def _maketrans(complement_mapping):
41 """Make a python string translation table (PRIVATE). 42 43 Arguments: 44 - complement_mapping - a dictionary such as ambiguous_dna_complement 45 and ambiguous_rna_complement from Data.IUPACData. 46 47 Returns a translation table (a string of length 256) for use with the 48 python string's translate method to use in a (reverse) complement. 49 50 Compatible with lower case and upper case sequences. 51 52 For internal use only. 53 """ 54 before = ''.join(complement_mapping.keys()) 55 after = ''.join(complement_mapping.values()) 56 before += before.lower() 57 after += after.lower() 58 if sys.version_info[0] == 3: 59 return str.maketrans(before, after) 60 else: 61 return string.maketrans(before, after)
62 63 64 _dna_complement_table = _maketrans(ambiguous_dna_complement) 65 _rna_complement_table = _maketrans(ambiguous_rna_complement) 66 67
68 -class Seq(object):
69 """Read-only sequence object (essentially a string with an alphabet). 70 71 Like normal python strings, our basic sequence object is immutable. 72 This prevents you from doing my_seq[5] = "A" for example, but does allow 73 Seq objects to be used as dictionary keys. 74 75 The Seq object provides a number of string like methods (such as count, 76 find, split and strip), which are alphabet aware where appropriate. 77 78 In addition to the string like sequence, the Seq object has an alphabet 79 property. This is an instance of an Alphabet class from Bio.Alphabet, 80 for example generic DNA, or IUPAC DNA. This describes the type of molecule 81 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 82 (letters). 83 84 The Seq object also provides some biological methods, such as complement, 85 reverse_complement, transcribe, back_transcribe and translate (which are 86 not applicable to sequences with a protein alphabet). 87 """ 88
89 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
90 """Create a Seq object. 91 92 Arguments: 93 - seq - Sequence, required (string) 94 - alphabet - Optional argument, an Alphabet object from 95 Bio.Alphabet 96 97 You will typically use Bio.SeqIO to read in sequences from files as 98 SeqRecord objects, whose sequence will be exposed as a Seq object via 99 the seq property. 100 101 However, will often want to create your own Seq objects directly: 102 103 >>> from Bio.Seq import Seq 104 >>> from Bio.Alphabet import IUPAC 105 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 106 ... IUPAC.protein) 107 >>> my_seq 108 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 109 >>> print(my_seq) 110 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 111 >>> my_seq.alphabet 112 IUPACProtein() 113 """ 114 # Enforce string storage 115 if not isinstance(data, basestring): 116 raise TypeError("The sequence data given to a Seq object should " 117 "be a string (not another Seq object etc)") 118 self._data = data 119 self.alphabet = alphabet # Seq API requirement
120
121 - def __repr__(self):
122 """Return (truncated) representation of the sequence for debugging.""" 123 if self.alphabet is Alphabet.generic_alphabet: 124 # Default used, we can omit it and simplify the representation 125 a = "" 126 else: 127 a = ", %r" % self.alphabet 128 if len(self) > 60: 129 # Shows the last three letters as it is often useful to see if 130 # there is a stop codon at the end of a sequence. 131 # Note total length is 54+3+3=60 132 return "{0}('{1}...{2}'{3!s})".format(self.__class__.__name__, 133 str(self)[:54], 134 str(self)[-3:], 135 a) 136 else: 137 return '{0}({1!r}{2!s})'.format(self.__class__.__name__, 138 self._data, 139 a)
140
141 - def __str__(self):
142 """Return the full sequence as a python string, use str(my_seq). 143 144 Note that Biopython 1.44 and earlier would give a truncated 145 version of repr(my_seq) for str(my_seq). If you are writing code 146 which need to be backwards compatible with old Biopython, you 147 should continue to use my_seq.tostring() rather than str(my_seq). 148 """ 149 return self._data
150
151 - def __hash__(self):
152 """Hash for comparison. 153 154 See the __cmp__ documentation - this has changed from past 155 versions of Biopython! 156 """ 157 # TODO - remove this warning in a future release 158 warnings.warn("Biopython Seq objects now use string comparison. " 159 "Older versions of Biopython used object comparison. " 160 "During this transition, please use hash(id(my_seq)) " 161 "or my_dict[id(my_seq)] if you want the old behaviour, " 162 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 163 "the new string hashing behaviour.", BiopythonWarning) 164 return hash(str(self))
165
166 - def __eq__(self, other):
167 """Compare the sequence to another sequence or a string (README). 168 169 Historically comparing Seq objects has done Python object comparison. 170 After considerable discussion (keeping in mind constraints of the 171 Python language, hashes and dictionary support), Biopython now uses 172 simple string comparison (with a warning about the change). 173 174 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 175 warning. 176 177 During this transition period, please just do explicit comparisons: 178 179 >>> from Bio.Seq import Seq 180 >>> from Bio.Alphabet import generic_dna 181 >>> seq1 = Seq("ACGT") 182 >>> seq2 = Seq("ACGT") 183 >>> id(seq1) == id(seq2) 184 False 185 >>> str(seq1) == str(seq2) 186 True 187 188 The new behaviour is to use string-like equality: 189 190 >>> from Bio.Seq import Seq 191 >>> from Bio.Alphabet import generic_dna 192 >>> seq1 == seq2 193 True 194 >>> seq1 == "ACGT" 195 True 196 >>> seq1 == Seq("ACGT", generic_dna) 197 True 198 199 """ 200 if hasattr(other, "alphabet"): 201 # other could be a Seq or a MutableSeq 202 if not Alphabet._check_type_compatible([self.alphabet, 203 other.alphabet]): 204 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 205 self.alphabet, other.alphabet), 206 BiopythonWarning) 207 return str(self) == str(other)
208
209 - def __ne__(self, other):
210 """Implement the not-equal operand.""" 211 # Require this method for Python 2 but not needed on Python 3 212 return not self == other
213
214 - def __lt__(self, other):
215 """Implement the less-than operand.""" 216 if hasattr(other, "alphabet"): 217 if not Alphabet._check_type_compatible([self.alphabet, 218 other.alphabet]): 219 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 220 self.alphabet, other.alphabet), 221 BiopythonWarning) 222 return str(self) < str(other)
223
224 - def __le__(self, other):
225 """Implement the less-than or equal operand.""" 226 if hasattr(other, "alphabet"): 227 if not Alphabet._check_type_compatible([self.alphabet, 228 other.alphabet]): 229 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 230 self.alphabet, other.alphabet), 231 BiopythonWarning) 232 return str(self) <= str(other)
233
234 - def __len__(self):
235 """Return the length of the sequence, use len(my_seq).""" 236 return len(self._data) # Seq API requirement
237
238 - def __getitem__(self, index): # Seq API requirement
239 """Return a subsequence of single letter, use my_seq[index]. 240 241 >>> my_seq = Seq('ACTCGACGTCG') 242 >>> my_seq[5] 243 'A' 244 """ 245 # Note since Python 2.0, __getslice__ is deprecated 246 # and __getitem__ is used instead. 247 # See http://docs.python.org/ref/sequence-methods.html 248 if isinstance(index, int): 249 # Return a single letter as a string 250 return self._data[index] 251 else: 252 # Return the (sub)sequence as another Seq object 253 return Seq(self._data[index], self.alphabet)
254
255 - def __add__(self, other):
256 """Add another sequence or string to this sequence. 257 258 If adding a string to a Seq, the alphabet is preserved: 259 260 >>> from Bio.Seq import Seq 261 >>> from Bio.Alphabet import generic_protein 262 >>> Seq("MELKI", generic_protein) + "LV" 263 Seq('MELKILV', ProteinAlphabet()) 264 265 When adding two Seq (like) objects, the alphabets are important. 266 Consider this example: 267 268 >>> from Bio.Seq import Seq 269 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 270 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 271 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 272 >>> unamb_dna_seq 273 Seq('ACGT', IUPACUnambiguousDNA()) 274 >>> ambig_dna_seq 275 Seq('ACRGT', IUPACAmbiguousDNA()) 276 277 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 278 the more general ambiguous IUPAC DNA alphabet: 279 280 >>> unamb_dna_seq + ambig_dna_seq 281 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 282 283 However, if the default generic alphabet is included, the result is 284 a generic alphabet: 285 286 >>> Seq("") + ambig_dna_seq 287 Seq('ACRGT') 288 289 You can't add RNA and DNA sequences: 290 291 >>> from Bio.Alphabet import generic_dna, generic_rna 292 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 293 Traceback (most recent call last): 294 ... 295 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 296 297 You can't add nucleotide and protein sequences: 298 299 >>> from Bio.Alphabet import generic_dna, generic_protein 300 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 301 Traceback (most recent call last): 302 ... 303 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 304 """ 305 if hasattr(other, "alphabet"): 306 # other should be a Seq or a MutableSeq 307 if not Alphabet._check_type_compatible([self.alphabet, 308 other.alphabet]): 309 raise TypeError( 310 "Incompatible alphabets {0!r} and {1!r}".format( 311 self.alphabet, other.alphabet)) 312 # They should be the same sequence type (or one of them is generic) 313 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 314 return self.__class__(str(self) + str(other), a) 315 elif isinstance(other, basestring): 316 # other is a plain string - use the current alphabet 317 return self.__class__(str(self) + other, self.alphabet) 318 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 319 if isinstance(other, SeqRecord): 320 # Get the SeqRecord's __radd__ to handle this 321 return NotImplemented 322 else: 323 raise TypeError
324
325 - def __radd__(self, other):
326 """Add a sequence on the left. 327 328 If adding a string to a Seq, the alphabet is preserved: 329 330 >>> from Bio.Seq import Seq 331 >>> from Bio.Alphabet import generic_protein 332 >>> "LV" + Seq("MELKI", generic_protein) 333 Seq('LVMELKI', ProteinAlphabet()) 334 335 Adding two Seq (like) objects is handled via the __add__ method. 336 """ 337 if hasattr(other, "alphabet"): 338 # other should be a Seq or a MutableSeq 339 if not Alphabet._check_type_compatible([self.alphabet, 340 other.alphabet]): 341 raise TypeError( 342 "Incompatible alphabets {0!r} and {1!r}".format( 343 self.alphabet, other.alphabet)) 344 # They should be the same sequence type (or one of them is generic) 345 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 346 return self.__class__(str(other) + str(self), a) 347 elif isinstance(other, basestring): 348 # other is a plain string - use the current alphabet 349 return self.__class__(other + str(self), self.alphabet) 350 else: 351 raise TypeError
352
353 - def __mul__(self, other):
354 """Multiply Seq by integer. 355 356 >>> from Bio.Seq import Seq 357 >>> from Bio.Alphabet import generic_dna 358 >>> Seq('ATG') * 2 359 Seq('ATGATG') 360 >>> Seq('ATG', generic_dna) * 2 361 Seq('ATGATG', DNAAlphabet()) 362 """ 363 if not isinstance(other, int): 364 raise TypeError("can't multiply Seq by non-int type") 365 return self.__class__(str(self) * other, self.alphabet)
366
367 - def __rmul__(self, other):
368 """Multiply integer by Seq. 369 370 >>> from Bio.Seq import Seq 371 >>> from Bio.Alphabet import generic_dna 372 >>> 2 * Seq('ATG') 373 Seq('ATGATG') 374 >>> 2 * Seq('ATG', generic_dna) 375 Seq('ATGATG', DNAAlphabet()) 376 """ 377 if not isinstance(other, int): 378 raise TypeError("can't multiply Seq by non-int type") 379 return self.__class__(str(self) * other, self.alphabet)
380
381 - def __imul__(self, other):
382 """Multiply Seq in-place. 383 384 Note although Seq is immutable, the in-place method is 385 included to match the behaviour for regular Python strings. 386 387 >>> from Bio.Seq import Seq 388 >>> from Bio.Alphabet import generic_dna 389 >>> seq = Seq('ATG', generic_dna) 390 >>> seq *= 2 391 >>> seq 392 Seq('ATGATG', DNAAlphabet()) 393 """ 394 if not isinstance(other, int): 395 raise TypeError("can't multiply Seq by non-int type") 396 return self.__class__(str(self) * other, self.alphabet)
397
398 - def tostring(self): # Seq API requirement
399 """Return the full sequence as a python string (DEPRECATED). 400 401 You are now encouraged to use str(my_seq) instead of 402 my_seq.tostring(). 403 """ 404 from Bio import BiopythonDeprecationWarning 405 warnings.warn("This method is obsolete; please use str(my_seq) " 406 "instead of my_seq.tostring().", 407 BiopythonDeprecationWarning) 408 return str(self) 409
410 - def tomutable(self): # Needed? Or use a function?
411 """Return the full sequence as a MutableSeq object. 412 413 >>> from Bio.Seq import Seq 414 >>> from Bio.Alphabet import IUPAC 415 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 416 ... IUPAC.protein) 417 >>> my_seq 418 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 419 >>> my_seq.tomutable() 420 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 421 422 Note that the alphabet is preserved. 423 """ 424 return MutableSeq(str(self), self.alphabet) 425
426 - def _get_seq_str_and_check_alphabet(self, other_sequence):
427 """Convert string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 428 429 For a string argument, returns the string. 430 431 For a Seq or MutableSeq, it checks the alphabet is compatible 432 (raising an exception if it isn't), and then returns a string. 433 """ 434 try: 435 other_alpha = other_sequence.alphabet 436 except AttributeError: 437 # Assume other_sequence is a string 438 return other_sequence 439 440 # Other should be a Seq or a MutableSeq 441 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 442 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 443 self.alphabet, other_alpha)) 444 # Return as a string 445 return str(other_sequence)
446
447 - def count(self, sub, start=0, end=sys.maxsize):
448 """Return a non-overlapping count, like that of a python string. 449 450 This behaves like the python string method of the same name, 451 which does a non-overlapping count! 452 453 For an overlapping search use the newer count_overlap() method. 454 455 Returns an integer, the number of occurrences of substring 456 argument sub in the (sub)sequence given by [start:end]. 457 Optional arguments start and end are interpreted as in slice 458 notation. 459 460 Arguments: 461 - sub - a string or another Seq object to look for 462 - start - optional integer, slice start 463 - end - optional integer, slice end 464 465 e.g. 466 467 >>> from Bio.Seq import Seq 468 >>> my_seq = Seq("AAAATGA") 469 >>> print(my_seq.count("A")) 470 5 471 >>> print(my_seq.count("ATG")) 472 1 473 >>> print(my_seq.count(Seq("AT"))) 474 1 475 >>> print(my_seq.count("AT", 2, -1)) 476 1 477 478 HOWEVER, please note because python strings and Seq objects (and 479 MutableSeq objects) do a non-overlapping search, this may not give 480 the answer you expect: 481 482 >>> "AAAA".count("AA") 483 2 484 >>> print(Seq("AAAA").count("AA")) 485 2 486 487 An overlapping search, as implemented in .count_overlap(), 488 would give the answer as three! 489 """ 490 # If it has one, check the alphabet: 491 sub_str = self._get_seq_str_and_check_alphabet(sub) 492 return str(self).count(sub_str, start, end)
493
494 - def count_overlap(self, sub, start=0, end=sys.maxsize):
495 """Return an overlapping count. 496 497 For a non-overlapping search use the count() method. 498 499 Returns an integer, the number of occurrences of substring 500 argument sub in the (sub)sequence given by [start:end]. 501 Optional arguments start and end are interpreted as in slice 502 notation. 503 504 Arguments: 505 - sub - a string or another Seq object to look for 506 - start - optional integer, slice start 507 - end - optional integer, slice end 508 509 e.g. 510 511 >>> from Bio.Seq import Seq 512 >>> print(Seq("AAAA").count_overlap("AA")) 513 3 514 >>> print(Seq("ATATATATA").count_overlap("ATA")) 515 4 516 >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1)) 517 1 518 519 Where substrings do not overlap, should behave the same as 520 the count() method: 521 522 >>> from Bio.Seq import Seq 523 >>> my_seq = Seq("AAAATGA") 524 >>> print(my_seq.count_overlap("A")) 525 5 526 >>> my_seq.count_overlap("A") == my_seq.count("A") 527 True 528 >>> print(my_seq.count_overlap("ATG")) 529 1 530 >>> my_seq.count_overlap("ATG") == my_seq.count("ATG") 531 True 532 >>> print(my_seq.count_overlap(Seq("AT"))) 533 1 534 >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT")) 535 True 536 >>> print(my_seq.count_overlap("AT", 2, -1)) 537 1 538 >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1) 539 True 540 541 HOWEVER, do not use this method for such cases because the 542 count() method is much for efficient. 543 """ 544 sub_str = self._get_seq_str_and_check_alphabet(sub) 545 self_str = str(self) 546 overlap_count = 0 547 while True: 548 start = self_str.find(sub_str, start, end) + 1 549 if start != 0: 550 overlap_count += 1 551 else: 552 return overlap_count
553
554 - def __contains__(self, char):
555 """Implement the 'in' keyword, like a python string. 556 557 e.g. 558 559 >>> from Bio.Seq import Seq 560 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 561 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 562 >>> "AAA" in my_dna 563 True 564 >>> Seq("AAA") in my_dna 565 True 566 >>> Seq("AAA", generic_dna) in my_dna 567 True 568 569 Like other Seq methods, this will raise a type error if another Seq 570 (or Seq like) object with an incompatible alphabet is used: 571 572 >>> Seq("AAA", generic_rna) in my_dna 573 Traceback (most recent call last): 574 ... 575 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 576 >>> Seq("AAA", generic_protein) in my_dna 577 Traceback (most recent call last): 578 ... 579 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 580 """ 581 # If it has one, check the alphabet: 582 sub_str = self._get_seq_str_and_check_alphabet(char) 583 return sub_str in str(self)
584
585 - def find(self, sub, start=0, end=sys.maxsize):
586 """Find method, like that of a python string. 587 588 This behaves like the python string method of the same name. 589 590 Returns an integer, the index of the first occurrence of substring 591 argument sub in the (sub)sequence given by [start:end]. 592 593 Arguments: 594 - sub - a string or another Seq object to look for 595 - start - optional integer, slice start 596 - end - optional integer, slice end 597 598 Returns -1 if the subsequence is NOT found. 599 600 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 601 602 >>> from Bio.Seq import Seq 603 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 604 >>> my_rna.find("AUG") 605 3 606 """ 607 # If it has one, check the alphabet: 608 sub_str = self._get_seq_str_and_check_alphabet(sub) 609 return str(self).find(sub_str, start, end)
610
611 - def rfind(self, sub, start=0, end=sys.maxsize):
612 """Find from right method, like that of a python string. 613 614 This behaves like the python string method of the same name. 615 616 Returns an integer, the index of the last (right most) occurrence of 617 substring argument sub in the (sub)sequence given by [start:end]. 618 619 Arguments: 620 - sub - a string or another Seq object to look for 621 - start - optional integer, slice start 622 - end - optional integer, slice end 623 624 Returns -1 if the subsequence is NOT found. 625 626 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 627 628 >>> from Bio.Seq import Seq 629 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 630 >>> my_rna.rfind("AUG") 631 15 632 """ 633 # If it has one, check the alphabet: 634 sub_str = self._get_seq_str_and_check_alphabet(sub) 635 return str(self).rfind(sub_str, start, end)
636
637 - def startswith(self, prefix, start=0, end=sys.maxsize):
638 """Return True if the Seq starts with the given prefix, False otherwise. 639 640 This behaves like the python string method of the same name. 641 642 Return True if the sequence starts with the specified prefix 643 (a string or another Seq object), False otherwise. 644 With optional start, test sequence beginning at that position. 645 With optional end, stop comparing sequence at that position. 646 prefix can also be a tuple of strings to try. e.g. 647 648 >>> from Bio.Seq import Seq 649 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 650 >>> my_rna.startswith("GUC") 651 True 652 >>> my_rna.startswith("AUG") 653 False 654 >>> my_rna.startswith("AUG", 3) 655 True 656 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 657 True 658 """ 659 # If it has one, check the alphabet: 660 if isinstance(prefix, tuple): 661 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 662 for p in prefix) 663 return str(self).startswith(prefix_strs, start, end) 664 else: 665 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 666 return str(self).startswith(prefix_str, start, end)
667
668 - def endswith(self, suffix, start=0, end=sys.maxsize):
669 """Return True if the Seq ends with the given suffix, False otherwise. 670 671 This behaves like the python string method of the same name. 672 673 Return True if the sequence ends with the specified suffix 674 (a string or another Seq object), False otherwise. 675 With optional start, test sequence beginning at that position. 676 With optional end, stop comparing sequence at that position. 677 suffix can also be a tuple of strings to try. e.g. 678 679 >>> from Bio.Seq import Seq 680 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 681 >>> my_rna.endswith("UUG") 682 True 683 >>> my_rna.endswith("AUG") 684 False 685 >>> my_rna.endswith("AUG", 0, 18) 686 True 687 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 688 True 689 """ 690 # If it has one, check the alphabet: 691 if isinstance(suffix, tuple): 692 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 693 for p in suffix) 694 return str(self).endswith(suffix_strs, start, end) 695 else: 696 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 697 return str(self).endswith(suffix_str, start, end)
698
699 - def split(self, sep=None, maxsplit=-1):
700 """Split method, like that of a python string. 701 702 This behaves like the python string method of the same name. 703 704 Return a list of the 'words' in the string (as Seq objects), 705 using sep as the delimiter string. If maxsplit is given, at 706 most maxsplit splits are done. If maxsplit is omitted, all 707 splits are made. 708 709 Following the python string method, sep will by default be any 710 white space (tabs, spaces, newlines) but this is unlikely to 711 apply to biological sequences. 712 713 e.g. 714 715 >>> from Bio.Seq import Seq 716 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 717 >>> my_aa = my_rna.translate() 718 >>> my_aa 719 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 720 >>> for pep in my_aa.split("*"): 721 ... pep 722 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 723 Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 724 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 725 >>> for pep in my_aa.split("*", 1): 726 ... pep 727 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 728 Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 729 730 See also the rsplit method: 731 732 >>> for pep in my_aa.rsplit("*", 1): 733 ... pep 734 Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 735 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 736 """ 737 # If it has one, check the alphabet: 738 sep_str = self._get_seq_str_and_check_alphabet(sep) 739 # TODO - If the sep is the defined stop symbol, or gap char, 740 # should we adjust the alphabet? 741 return [Seq(part, self.alphabet) 742 for part in str(self).split(sep_str, maxsplit)]
743
744 - def rsplit(self, sep=None, maxsplit=-1):
745 """Do a right split method, like that of a python string. 746 747 This behaves like the python string method of the same name. 748 749 Return a list of the 'words' in the string (as Seq objects), 750 using sep as the delimiter string. If maxsplit is given, at 751 most maxsplit splits are done COUNTING FROM THE RIGHT. 752 If maxsplit is omitted, all splits are made. 753 754 Following the python string method, sep will by default be any 755 white space (tabs, spaces, newlines) but this is unlikely to 756 apply to biological sequences. 757 758 e.g. print(my_seq.rsplit("*",1)) 759 760 See also the split method. 761 """ 762 # If it has one, check the alphabet: 763 sep_str = self._get_seq_str_and_check_alphabet(sep) 764 return [Seq(part, self.alphabet) 765 for part in str(self).rsplit(sep_str, maxsplit)]
766
767 - def strip(self, chars=None):
768 """Return a new Seq object with leading and trailing ends stripped. 769 770 This behaves like the python string method of the same name. 771 772 Optional argument chars defines which characters to remove. If 773 omitted or None (default) then as for the python string method, 774 this defaults to removing any white space. 775 776 e.g. print(my_seq.strip("-")) 777 778 See also the lstrip and rstrip methods. 779 """ 780 # If it has one, check the alphabet: 781 strip_str = self._get_seq_str_and_check_alphabet(chars) 782 return Seq(str(self).strip(strip_str), self.alphabet)
783
784 - def lstrip(self, chars=None):
785 """Return a new Seq object with leading (left) end stripped. 786 787 This behaves like the python string method of the same name. 788 789 Optional argument chars defines which characters to remove. If 790 omitted or None (default) then as for the python string method, 791 this defaults to removing any white space. 792 793 e.g. print(my_seq.lstrip("-")) 794 795 See also the strip and rstrip methods. 796 """ 797 # If it has one, check the alphabet: 798 strip_str = self._get_seq_str_and_check_alphabet(chars) 799 return Seq(str(self).lstrip(strip_str), self.alphabet)
800
801 - def rstrip(self, chars=None):
802 """Return a new Seq object with trailing (right) end stripped. 803 804 This behaves like the python string method of the same name. 805 806 Optional argument chars defines which characters to remove. If 807 omitted or None (default) then as for the python string method, 808 this defaults to removing any white space. 809 810 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 811 812 >>> from Bio.Alphabet import IUPAC 813 >>> from Bio.Seq import Seq 814 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 815 >>> my_seq 816 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 817 >>> my_seq.rstrip("A") 818 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 819 820 See also the strip and lstrip methods. 821 """ 822 # If it has one, check the alphabet: 823 strip_str = self._get_seq_str_and_check_alphabet(chars) 824 return Seq(str(self).rstrip(strip_str), self.alphabet)
825
826 - def upper(self):
827 """Return an upper case copy of the sequence. 828 829 >>> from Bio.Alphabet import HasStopCodon, generic_protein 830 >>> from Bio.Seq import Seq 831 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 832 >>> my_seq 833 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 834 >>> my_seq.lower() 835 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 836 >>> my_seq.upper() 837 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 838 839 This will adjust the alphabet if required. See also the lower method. 840 """ 841 return Seq(str(self).upper(), self.alphabet._upper())
842
843 - def lower(self):
844 """Return a lower case copy of the sequence. 845 846 This will adjust the alphabet if required. Note that the IUPAC 847 alphabets are upper case only, and thus a generic alphabet must be 848 substituted. 849 850 >>> from Bio.Alphabet import Gapped, generic_dna 851 >>> from Bio.Alphabet import IUPAC 852 >>> from Bio.Seq import Seq 853 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", 854 ... Gapped(IUPAC.unambiguous_dna, "*")) 855 >>> my_seq 856 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 857 >>> my_seq.lower() 858 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 859 860 See also the upper method. 861 """ 862 return Seq(str(self).lower(), self.alphabet._lower())
863
864 - def complement(self):
865 """Return the complement sequence by creating a new Seq object. 866 867 >>> from Bio.Seq import Seq 868 >>> from Bio.Alphabet import IUPAC 869 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 870 >>> my_dna 871 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 872 >>> my_dna.complement() 873 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 874 875 You can of course used mixed case sequences, 876 877 >>> from Bio.Seq import Seq 878 >>> from Bio.Alphabet import generic_dna 879 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 880 >>> my_dna 881 Seq('CCCCCgatA-GD', DNAAlphabet()) 882 >>> my_dna.complement() 883 Seq('GGGGGctaT-CH', DNAAlphabet()) 884 885 Note in the above example, ambiguous character D denotes 886 G, A or T so its complement is H (for C, T or A). 887 888 Trying to complement a protein sequence raises an exception. 889 890 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 891 >>> my_protein.complement() 892 Traceback (most recent call last): 893 ... 894 ValueError: Proteins do not have complements! 895 """ 896 base = Alphabet._get_base_alphabet(self.alphabet) 897 if isinstance(base, Alphabet.ProteinAlphabet): 898 raise ValueError("Proteins do not have complements!") 899 if isinstance(base, Alphabet.DNAAlphabet): 900 ttable = _dna_complement_table 901 elif isinstance(base, Alphabet.RNAAlphabet): 902 ttable = _rna_complement_table 903 elif ('U' in self._data or 'u' in self._data) \ 904 and ('T' in self._data or 't' in self._data): 905 # TODO - Handle this cleanly? 906 raise ValueError("Mixed RNA/DNA found") 907 elif 'U' in self._data or 'u' in self._data: 908 ttable = _rna_complement_table 909 else: 910 ttable = _dna_complement_table 911 # Much faster on really long sequences than the previous loop based 912 # one. Thanks to Michael Palmer, University of Waterloo. 913 return Seq(str(self).translate(ttable), self.alphabet)
914
915 - def reverse_complement(self):
916 """Return the reverse complement sequence by creating a new Seq object. 917 918 >>> from Bio.Seq import Seq 919 >>> from Bio.Alphabet import IUPAC 920 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 921 >>> my_dna 922 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 923 >>> my_dna.reverse_complement() 924 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 925 926 Note in the above example, since R = G or A, its complement 927 is Y (which denotes C or T). 928 929 You can of course used mixed case sequences, 930 931 >>> from Bio.Seq import Seq 932 >>> from Bio.Alphabet import generic_dna 933 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 934 >>> my_dna 935 Seq('CCCCCgatA-G', DNAAlphabet()) 936 >>> my_dna.reverse_complement() 937 Seq('C-TatcGGGGG', DNAAlphabet()) 938 939 Trying to complement a protein sequence raises an exception: 940 941 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 942 >>> my_protein.reverse_complement() 943 Traceback (most recent call last): 944 ... 945 ValueError: Proteins do not have complements! 946 """ 947 # Use -1 stride/step to reverse the complement 948 return self.complement()[::-1]
949
950 - def transcribe(self):
951 """Return the RNA sequence from a DNA sequence by creating a new Seq object. 952 953 >>> from Bio.Seq import Seq 954 >>> from Bio.Alphabet import IUPAC 955 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 956 ... IUPAC.unambiguous_dna) 957 >>> coding_dna 958 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 959 >>> coding_dna.transcribe() 960 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 961 962 Trying to transcribe a protein or RNA sequence raises an exception: 963 964 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 965 >>> my_protein.transcribe() 966 Traceback (most recent call last): 967 ... 968 ValueError: Proteins cannot be transcribed! 969 """ 970 base = Alphabet._get_base_alphabet(self.alphabet) 971 if isinstance(base, Alphabet.ProteinAlphabet): 972 raise ValueError("Proteins cannot be transcribed!") 973 if isinstance(base, Alphabet.RNAAlphabet): 974 raise ValueError("RNA cannot be transcribed!") 975 976 if self.alphabet == IUPAC.unambiguous_dna: 977 alphabet = IUPAC.unambiguous_rna 978 elif self.alphabet == IUPAC.ambiguous_dna: 979 alphabet = IUPAC.ambiguous_rna 980 else: 981 alphabet = Alphabet.generic_rna 982 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
983
984 - def back_transcribe(self):
985 """Return the DNA sequence from an RNA sequence by creating a new Seq object. 986 987 >>> from Bio.Seq import Seq 988 >>> from Bio.Alphabet import IUPAC 989 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 990 ... IUPAC.unambiguous_rna) 991 >>> messenger_rna 992 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 993 >>> messenger_rna.back_transcribe() 994 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 995 996 Trying to back-transcribe a protein or DNA sequence raises an 997 exception: 998 999 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 1000 >>> my_protein.back_transcribe() 1001 Traceback (most recent call last): 1002 ... 1003 ValueError: Proteins cannot be back transcribed! 1004 """ 1005 base = Alphabet._get_base_alphabet(self.alphabet) 1006 if isinstance(base, Alphabet.ProteinAlphabet): 1007 raise ValueError("Proteins cannot be back transcribed!") 1008 if isinstance(base, Alphabet.DNAAlphabet): 1009 raise ValueError("DNA cannot be back transcribed!") 1010 1011 if self.alphabet == IUPAC.unambiguous_rna: 1012 alphabet = IUPAC.unambiguous_dna 1013 elif self.alphabet == IUPAC.ambiguous_rna: 1014 alphabet = IUPAC.ambiguous_dna 1015 else: 1016 alphabet = Alphabet.generic_dna 1017 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
1018
1019 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 1020 cds=False, gap=None):
1021 """Turn a nucleotide sequence into a protein sequence by creating a new Seq object. 1022 1023 This method will translate DNA or RNA sequences, and those with a 1024 nucleotide or generic alphabet. Trying to translate a protein 1025 sequence raises an exception. 1026 1027 Arguments: 1028 - table - Which codon table to use? This can be either a name 1029 (string), an NCBI identifier (integer), or a CodonTable 1030 object (useful for non-standard genetic codes). This 1031 defaults to the "Standard" table. 1032 - stop_symbol - Single character string, what to use for 1033 terminators. This defaults to the asterisk, "*". 1034 - to_stop - Boolean, defaults to False meaning do a full 1035 translation continuing on past any stop codons (translated as the 1036 specified stop_symbol). If True, translation is terminated at 1037 the first in frame stop codon (and the stop_symbol is not 1038 appended to the returned protein sequence). 1039 - cds - Boolean, indicates this is a complete CDS. If True, 1040 this checks the sequence starts with a valid alternative start 1041 codon (which will be translated as methionine, M), that the 1042 sequence length is a multiple of three, and that there is a 1043 single in frame stop codon at the end (this will be excluded 1044 from the protein sequence, regardless of the to_stop option). 1045 If these tests fail, an exception is raised. 1046 - gap - Single character string to denote symbol used for gaps. 1047 It will try to guess the gap character from the alphabet. 1048 1049 e.g. Using the standard table: 1050 1051 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 1052 >>> coding_dna.translate() 1053 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 1054 >>> coding_dna.translate(stop_symbol="@") 1055 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 1056 >>> coding_dna.translate(to_stop=True) 1057 Seq('VAIVMGR', ExtendedIUPACProtein()) 1058 1059 Now using NCBI table 2, where TGA is not a stop codon: 1060 1061 >>> coding_dna.translate(table=2) 1062 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 1063 >>> coding_dna.translate(table=2, to_stop=True) 1064 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 1065 1066 In fact, GTG is an alternative start codon under NCBI table 2, meaning 1067 this sequence could be a complete CDS: 1068 1069 >>> coding_dna.translate(table=2, cds=True) 1070 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 1071 1072 It isn't a valid CDS under NCBI table 1, due to both the start codon 1073 and also the in frame stop codons: 1074 1075 >>> coding_dna.translate(table=1, cds=True) 1076 Traceback (most recent call last): 1077 ... 1078 TranslationError: First codon 'GTG' is not a start codon 1079 1080 If the sequence has no in-frame stop codon, then the to_stop argument 1081 has no effect: 1082 1083 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 1084 >>> coding_dna2.translate() 1085 Seq('LAIVMGR', ExtendedIUPACProtein()) 1086 >>> coding_dna2.translate(to_stop=True) 1087 Seq('LAIVMGR', ExtendedIUPACProtein()) 1088 1089 When translating gapped sequences, the gap character is inferred from 1090 the alphabet: 1091 1092 >>> from Bio.Alphabet import Gapped 1093 >>> coding_dna3 = Seq("GTG---GCCATT", Gapped(IUPAC.unambiguous_dna)) 1094 >>> coding_dna3.translate() 1095 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 1096 1097 It is possible to pass the gap character when the alphabet is missing: 1098 1099 >>> coding_dna4 = Seq("GTG---GCCATT") 1100 >>> coding_dna4.translate(gap='-') 1101 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 1102 1103 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1104 or a stop codon. These are translated as "X". Any invalid codon 1105 (e.g. "TA?" or "T-A") will throw a TranslationError. 1106 1107 NOTE - This does NOT behave like the python string's translate 1108 method. For that use str(my_seq).translate(...) instead. 1109 """ 1110 if isinstance(table, str) and len(table) == 256: 1111 raise ValueError("The Seq object translate method DOES NOT take " 1112 "a 256 character string mapping table like " 1113 "the python string object's translate method. " 1114 "Use str(my_seq).translate(...) instead.") 1115 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1116 Alphabet.ProteinAlphabet): 1117 raise ValueError("Proteins cannot be translated!") 1118 try: 1119 table_id = int(table) 1120 except ValueError: 1121 # Assume its a table name 1122 if self.alphabet == IUPAC.unambiguous_dna: 1123 # Will use standard IUPAC protein alphabet, no need for X 1124 codon_table = CodonTable.unambiguous_dna_by_name[table] 1125 elif self.alphabet == IUPAC.unambiguous_rna: 1126 # Will use standard IUPAC protein alphabet, no need for X 1127 codon_table = CodonTable.unambiguous_rna_by_name[table] 1128 else: 1129 # This will use the extended IUPAC protein alphabet with X etc. 1130 # The same table can be used for RNA or DNA (we use this for 1131 # translating strings). 1132 codon_table = CodonTable.ambiguous_generic_by_name[table] 1133 except (AttributeError, TypeError): 1134 # Assume its a CodonTable object 1135 if isinstance(table, CodonTable.CodonTable): 1136 codon_table = table 1137 else: 1138 raise ValueError('Bad table argument') 1139 else: 1140 # Assume its a table ID 1141 if self.alphabet == IUPAC.unambiguous_dna: 1142 # Will use standard IUPAC protein alphabet, no need for X 1143 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 1144 elif self.alphabet == IUPAC.unambiguous_rna: 1145 # Will use standard IUPAC protein alphabet, no need for X 1146 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 1147 else: 1148 # This will use the extended IUPAC protein alphabet with X etc. 1149 # The same table can be used for RNA or DNA (we use this for 1150 # translating strings). 1151 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1152 1153 # Deal with gaps for translation 1154 if hasattr(self.alphabet, "gap_char"): 1155 if not gap: 1156 gap = self.alphabet.gap_char 1157 elif gap != self.alphabet.gap_char: 1158 raise ValueError( 1159 "Gap {0!r} does not match {1!r} from alphabet".format( 1160 gap, self.alphabet.gap_char)) 1161 1162 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop, 1163 cds, gap=gap) 1164 1165 if gap and gap in protein: 1166 alphabet = Alphabet.Gapped(codon_table.protein_alphabet, gap) 1167 else: 1168 alphabet = codon_table.protein_alphabet 1169 1170 if stop_symbol in protein: 1171 alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol) 1172 1173 return Seq(protein, alphabet)
1174
1175 - def ungap(self, gap=None):
1176 """Return a copy of the sequence without the gap character(s). 1177 1178 The gap character can be specified in two ways - either as an explicit 1179 argument, or via the sequence's alphabet. For example: 1180 1181 >>> from Bio.Seq import Seq 1182 >>> from Bio.Alphabet import generic_dna 1183 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1184 >>> my_dna 1185 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1186 >>> my_dna.ungap("-") 1187 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1188 1189 If the gap character is not given as an argument, it will be taken from 1190 the sequence's alphabet (if defined). Notice that the returned 1191 sequence's alphabet is adjusted since it no longer requires a gapped 1192 alphabet: 1193 1194 >>> from Bio.Seq import Seq 1195 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1196 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1197 >>> my_pro 1198 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1199 >>> my_pro.ungap() 1200 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1201 1202 Or, with a simpler gapped DNA example: 1203 1204 >>> from Bio.Seq import Seq 1205 >>> from Bio.Alphabet import IUPAC, Gapped 1206 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1207 >>> my_seq 1208 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1209 >>> my_seq.ungap() 1210 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1211 1212 As long as it is consistent with the alphabet, although it is 1213 redundant, you can still supply the gap character as an argument to 1214 this method: 1215 1216 >>> my_seq 1217 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1218 >>> my_seq.ungap("=") 1219 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1220 1221 However, if the gap character given as the argument disagrees with that 1222 declared in the alphabet, an exception is raised: 1223 1224 >>> my_seq 1225 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1226 >>> my_seq.ungap("-") 1227 Traceback (most recent call last): 1228 ... 1229 ValueError: Gap '-' does not match '=' from alphabet 1230 1231 Finally, if a gap character is not supplied, and the alphabet does not 1232 define one, an exception is raised: 1233 1234 >>> from Bio.Seq import Seq 1235 >>> from Bio.Alphabet import generic_dna 1236 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1237 >>> my_dna 1238 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1239 >>> my_dna.ungap() 1240 Traceback (most recent call last): 1241 ... 1242 ValueError: Gap character not given and not defined in alphabet 1243 1244 """ 1245 if hasattr(self.alphabet, "gap_char"): 1246 if not gap: 1247 gap = self.alphabet.gap_char 1248 elif gap != self.alphabet.gap_char: 1249 raise ValueError( 1250 "Gap {0!r} does not match {1!r} from alphabet".format( 1251 gap, self.alphabet.gap_char)) 1252 alpha = Alphabet._ungap(self.alphabet) 1253 elif not gap: 1254 raise ValueError("Gap character not given and not defined in " 1255 "alphabet") 1256 else: 1257 alpha = self.alphabet # modify! 1258 if len(gap) != 1 or not isinstance(gap, str): 1259 raise ValueError("Unexpected gap character, {0!r}".format(gap)) 1260 return Seq(str(self).replace(gap, ""), alpha)
1261 1262
1263 -class UnknownSeq(Seq):
1264 """Read-only sequence object of known length but unknown contents. 1265 1266 If you have an unknown sequence, you can represent this with a normal 1267 Seq object, for example: 1268 1269 >>> my_seq = Seq("N"*5) 1270 >>> my_seq 1271 Seq('NNNNN') 1272 >>> len(my_seq) 1273 5 1274 >>> print(my_seq) 1275 NNNNN 1276 1277 However, this is rather wasteful of memory (especially for large 1278 sequences), which is where this class is most useful: 1279 1280 >>> unk_five = UnknownSeq(5) 1281 >>> unk_five 1282 UnknownSeq(5, character='?') 1283 >>> len(unk_five) 1284 5 1285 >>> print(unk_five) 1286 ????? 1287 1288 You can add unknown sequence together, provided their alphabets and 1289 characters are compatible, and get another memory saving UnknownSeq: 1290 1291 >>> unk_four = UnknownSeq(4) 1292 >>> unk_four 1293 UnknownSeq(4, character='?') 1294 >>> unk_four + unk_five 1295 UnknownSeq(9, character='?') 1296 1297 If the alphabet or characters don't match up, the addition gives an 1298 ordinary Seq object: 1299 1300 >>> unk_nnnn = UnknownSeq(4, character="N") 1301 >>> unk_nnnn 1302 UnknownSeq(4, character='N') 1303 >>> unk_nnnn + unk_four 1304 Seq('NNNN????') 1305 1306 Combining with a real Seq gives a new Seq object: 1307 1308 >>> known_seq = Seq("ACGT") 1309 >>> unk_four + known_seq 1310 Seq('????ACGT') 1311 >>> known_seq + unk_four 1312 Seq('ACGT????') 1313 """ 1314
1315 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, 1316 character=None):
1317 """Create a new UnknownSeq object. 1318 1319 If character is omitted, it is determined from the alphabet, "N" for 1320 nucleotides, "X" for proteins, and "?" otherwise. 1321 """ 1322 self._length = int(length) 1323 if self._length < 0: 1324 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1325 raise ValueError("Length must not be negative.") 1326 self.alphabet = alphabet 1327 if character: 1328 if len(character) != 1: 1329 raise ValueError("character argument should be a single " 1330 "letter string.") 1331 self._character = character 1332 else: 1333 base = Alphabet._get_base_alphabet(alphabet) 1334 # TODO? Check the case of the letters in the alphabet? 1335 # We may have to use "n" instead of "N" etc. 1336 if isinstance(base, Alphabet.NucleotideAlphabet): 1337 self._character = "N" 1338 elif isinstance(base, Alphabet.ProteinAlphabet): 1339 self._character = "X" 1340 else: 1341 self._character = "?"
1342
1343 - def __len__(self):
1344 """Return the stated length of the unknown sequence.""" 1345 return self._length
1346
1347 - def __str__(self):
1348 """Return the unknown sequence as full string of the given length.""" 1349 return self._character * self._length
1350
1351 - def __repr__(self):
1352 """Return (truncated) representation of the sequence for debugging.""" 1353 if self.alphabet is Alphabet.generic_alphabet: 1354 # Default used, we can omit it and simplify the representation 1355 a = "" 1356 else: 1357 a = ", alphabet=%r" % self.alphabet 1358 return "UnknownSeq({0}{1!s}, character={2!r})".format( 1359 self._length, a, self._character)
1360
1361 - def __add__(self, other):
1362 """Add another sequence or string to this sequence. 1363 1364 Adding two UnknownSeq objects returns another UnknownSeq object 1365 provided the character is the same and the alphabets are compatible. 1366 1367 >>> from Bio.Seq import UnknownSeq 1368 >>> from Bio.Alphabet import generic_protein 1369 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1370 UnknownSeq(15, alphabet=ProteinAlphabet(), character='X') 1371 1372 If the characters differ, an UnknownSeq object cannot be used, so a 1373 Seq object is returned: 1374 1375 >>> from Bio.Seq import UnknownSeq 1376 >>> from Bio.Alphabet import generic_protein 1377 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1378 ... character="x") 1379 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1380 1381 If adding a string to an UnknownSeq, a new Seq is returned with the 1382 same alphabet: 1383 1384 >>> from Bio.Seq import UnknownSeq 1385 >>> from Bio.Alphabet import generic_protein 1386 >>> UnknownSeq(5, generic_protein) + "LV" 1387 Seq('XXXXXLV', ProteinAlphabet()) 1388 """ 1389 if isinstance(other, UnknownSeq) and \ 1390 other._character == self._character: 1391 # TODO - Check the alphabets match 1392 return UnknownSeq(len(self) + len(other), 1393 self.alphabet, self._character) 1394 # Offload to the base class... 1395 return Seq(str(self), self.alphabet) + other
1396
1397 - def __radd__(self, other):
1398 """Add a sequence on the left.""" 1399 # If other is an UnknownSeq, then __add__ would be called. 1400 # Offload to the base class... 1401 return other + Seq(str(self), self.alphabet)
1402
1403 - def __mul__(self, other):
1404 """Multiply UnknownSeq by integer. 1405 1406 >>> from Bio.Seq import UnknownSeq 1407 >>> from Bio.Alphabet import generic_dna 1408 >>> UnknownSeq(3) * 2 1409 UnknownSeq(6, character='?') 1410 >>> UnknownSeq(3, generic_dna) * 2 1411 UnknownSeq(6, alphabet=DNAAlphabet(), character='N') 1412 """ 1413 if not isinstance(other, int): 1414 raise TypeError("can't multiply UnknownSeq by non-int type") 1415 return self.__class__(len(self) * other, self.alphabet)
1416
1417 - def __rmul__(self, other):
1418 """Multiply integer by UnknownSeq. 1419 1420 >>> from Bio.Seq import UnknownSeq 1421 >>> from Bio.Alphabet import generic_dna 1422 >>> 2 * UnknownSeq(3) 1423 UnknownSeq(6, character='?') 1424 >>> 2 * UnknownSeq(3, generic_dna) 1425 UnknownSeq(6, alphabet=DNAAlphabet(), character='N') 1426 """ 1427 if not isinstance(other, int): 1428 raise TypeError("can't multiply UnknownSeq by non-int type") 1429 return self.__class__(len(self) * other, self.alphabet)
1430
1431 - def __imul__(self, other):
1432 """Multiply UnknownSeq in-place. 1433 1434 Note although UnknownSeq is immutable, the in-place method is 1435 included to match the behaviour for regular Python strings. 1436 1437 >>> from Bio.Seq import UnknownSeq 1438 >>> from Bio.Alphabet import generic_dna 1439 >>> seq = UnknownSeq(3, generic_dna) 1440 >>> seq *= 2 1441 >>> seq 1442 UnknownSeq(6, alphabet=DNAAlphabet(), character='N') 1443 """ 1444 if not isinstance(other, int): 1445 raise TypeError("can't multiply UnknownSeq by non-int type") 1446 return self.__class__(len(self) * other, self.alphabet)
1447
1448 - def __getitem__(self, index):
1449 """Get a subsequence from the UnknownSeq object. 1450 1451 >>> unk = UnknownSeq(8, character="N") 1452 >>> print(unk[:]) 1453 NNNNNNNN 1454 >>> print(unk[5:3]) 1455 <BLANKLINE> 1456 >>> print(unk[1:-1]) 1457 NNNNNN 1458 >>> print(unk[1:-1:2]) 1459 NNN 1460 """ 1461 if isinstance(index, int): 1462 # TODO - Check the bounds without wasting memory 1463 return str(self)[index] 1464 old_length = self._length 1465 step = index.step 1466 if step is None or step == 1: 1467 # This calculates the length you'd get from ("N"*old_length)[index] 1468 start = index.start 1469 end = index.stop 1470 if start is None: 1471 start = 0 1472 elif start < 0: 1473 start = max(0, old_length + start) 1474 elif start > old_length: 1475 start = old_length 1476 if end is None: 1477 end = old_length 1478 elif end < 0: 1479 end = max(0, old_length + end) 1480 elif end > old_length: 1481 end = old_length 1482 new_length = max(0, end - start) 1483 elif step == 0: 1484 raise ValueError("slice step cannot be zero") 1485 else: 1486 # TODO - handle step efficiently 1487 new_length = len(("X" * old_length)[index]) 1488 # assert new_length == len(("X"*old_length)[index]), \ 1489 # (index, start, end, step, old_length, 1490 # new_length, len(("X"*old_length)[index])) 1491 return UnknownSeq(new_length, self.alphabet, self._character)
1492
1493 - def count(self, sub, start=0, end=sys.maxsize):
1494 """Return a non-overlapping count, like that of a python string. 1495 1496 This behaves like the python string (and Seq object) method of the 1497 same name, which does a non-overlapping count! 1498 1499 For an overlapping search use the newer count_overlap() method. 1500 1501 Returns an integer, the number of occurrences of substring 1502 argument sub in the (sub)sequence given by [start:end]. 1503 Optional arguments start and end are interpreted as in slice 1504 notation. 1505 1506 Arguments: 1507 - sub - a string or another Seq object to look for 1508 - start - optional integer, slice start 1509 - end - optional integer, slice end 1510 1511 >>> "NNNN".count("N") 1512 4 1513 >>> Seq("NNNN").count("N") 1514 4 1515 >>> UnknownSeq(4, character="N").count("N") 1516 4 1517 >>> UnknownSeq(4, character="N").count("A") 1518 0 1519 >>> UnknownSeq(4, character="N").count("AA") 1520 0 1521 1522 HOWEVER, please note because that python strings and Seq objects (and 1523 MutableSeq objects) do a non-overlapping search, this may not give 1524 the answer you expect: 1525 1526 >>> UnknownSeq(4, character="N").count("NN") 1527 2 1528 >>> UnknownSeq(4, character="N").count("NNN") 1529 1 1530 """ 1531 sub_str = self._get_seq_str_and_check_alphabet(sub) 1532 len_self, len_sub_str = self._length, len(sub_str) 1533 # Handling case where substring not in self 1534 if set(sub_str) != set(self._character): 1535 return 0 1536 # Setting None to the default arguments 1537 if start is None: 1538 start = 0 1539 if end is None: 1540 end = sys.maxsize 1541 # Truncating start and end to max of self._length and min of -self._length 1542 start = max(min(start, len_self), -len_self) 1543 end = max(min(end, len_self), -len_self) 1544 # Convert start and ends to positive indexes 1545 if start < 0: 1546 start += len_self 1547 if end < 0: 1548 end += len_self 1549 # Handle case where end <= start (no negative step argument here) 1550 # and case where len_sub_str is larger than the search space 1551 if end <= start or (end - start) < len_sub_str: 1552 return 0 1553 # 'Normal' calculation 1554 return (end - start) // len_sub_str
1555
1556 - def count_overlap(self, sub, start=0, end=sys.maxsize):
1557 """Return an overlapping count. 1558 1559 For a non-overlapping search use the count() method. 1560 1561 Returns an integer, the number of occurrences of substring 1562 argument sub in the (sub)sequence given by [start:end]. 1563 Optional arguments start and end are interpreted as in slice 1564 notation. 1565 1566 Arguments: 1567 - sub - a string or another Seq object to look for 1568 - start - optional integer, slice start 1569 - end - optional integer, slice end 1570 1571 e.g. 1572 1573 >>> from Bio.Seq import UnknownSeq 1574 >>> UnknownSeq(4, character="N").count_overlap("NN") 1575 3 1576 >>> UnknownSeq(4, character="N").count_overlap("NNN") 1577 2 1578 1579 Where substrings do not overlap, should behave the same as 1580 the count() method: 1581 1582 >>> UnknownSeq(4, character="N").count_overlap("N") 1583 4 1584 >>> UnknownSeq(4, character="N").count_overlap("N") == UnknownSeq(4, character="N").count("N") 1585 True 1586 >>> UnknownSeq(4, character="N").count_overlap("A") 1587 0 1588 >>> UnknownSeq(4, character="N").count_overlap("A") == UnknownSeq(4, character="N").count("A") 1589 True 1590 >>> UnknownSeq(4, character="N").count_overlap("AA") 1591 0 1592 >>> UnknownSeq(4, character="N").count_overlap("AA") == UnknownSeq(4, character="N").count("AA") 1593 True 1594 """ 1595 sub_str = self._get_seq_str_and_check_alphabet(sub) 1596 len_self, len_sub_str = self._length, len(sub_str) 1597 # Handling case where substring not in self 1598 if set(sub_str) != set(self._character): 1599 return 0 1600 # Setting None to the default arguments 1601 if start is None: 1602 start = 0 1603 if end is None: 1604 end = sys.maxsize 1605 # Truncating start and end to max of self._length and min of -self._length 1606 start = max(min(start, len_self), -len_self) 1607 end = max(min(end, len_self), -len_self) 1608 # Convert start and ends to positive indexes 1609 if start < 0: 1610 start += len_self 1611 if end < 0: 1612 end += len_self 1613 # Handle case where end <= start (no negative step argument here) 1614 # and case where len_sub_str is larger than the search space 1615 if end <= start or (end - start) < len_sub_str: 1616 return 0 1617 # 'Normal' calculation 1618 return end - start - len_sub_str + 1
1619
1620 - def complement(self):
1621 """Return the complement of an unknown nucleotide equals itself. 1622 1623 >>> my_nuc = UnknownSeq(8) 1624 >>> my_nuc 1625 UnknownSeq(8, character='?') 1626 >>> print(my_nuc) 1627 ???????? 1628 >>> my_nuc.complement() 1629 UnknownSeq(8, character='?') 1630 >>> print(my_nuc.complement()) 1631 ???????? 1632 """ 1633 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1634 Alphabet.ProteinAlphabet): 1635 raise ValueError("Proteins do not have complements!") 1636 return self
1637
1638 - def reverse_complement(self):
1639 """Return the reverse complement of an unknown sequence. 1640 1641 The reverse complement of an unknown nucleotide equals itself: 1642 1643 >>> from Bio.Seq import UnknownSeq 1644 >>> from Bio.Alphabet import generic_dna 1645 >>> example = UnknownSeq(6, generic_dna) 1646 >>> print(example) 1647 NNNNNN 1648 >>> print(example.reverse_complement()) 1649 NNNNNN 1650 """ 1651 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1652 Alphabet.ProteinAlphabet): 1653 raise ValueError("Proteins do not have complements!") 1654 return self
1655
1656 - def transcribe(self):
1657 """Return an unknown RNA sequence from an unknown DNA sequence. 1658 1659 >>> my_dna = UnknownSeq(10, character="N") 1660 >>> my_dna 1661 UnknownSeq(10, character='N') 1662 >>> print(my_dna) 1663 NNNNNNNNNN 1664 >>> my_rna = my_dna.transcribe() 1665 >>> my_rna 1666 UnknownSeq(10, alphabet=RNAAlphabet(), character='N') 1667 >>> print(my_rna) 1668 NNNNNNNNNN 1669 """ 1670 # Offload the alphabet stuff 1671 s = Seq(self._character, self.alphabet).transcribe() 1672 return UnknownSeq(self._length, s.alphabet, self._character)
1673
1674 - def back_transcribe(self):
1675 """Return an unknown DNA sequence from an unknown RNA sequence. 1676 1677 >>> my_rna = UnknownSeq(20, character="N") 1678 >>> my_rna 1679 UnknownSeq(20, character='N') 1680 >>> print(my_rna) 1681 NNNNNNNNNNNNNNNNNNNN 1682 >>> my_dna = my_rna.back_transcribe() 1683 >>> my_dna 1684 UnknownSeq(20, alphabet=DNAAlphabet(), character='N') 1685 >>> print(my_dna) 1686 NNNNNNNNNNNNNNNNNNNN 1687 """ 1688 # Offload the alphabet stuff 1689 s = Seq(self._character, self.alphabet).back_transcribe() 1690 return UnknownSeq(self._length, s.alphabet, self._character)
1691
1692 - def upper(self):
1693 """Return an upper case copy of the sequence. 1694 1695 >>> from Bio.Alphabet import generic_dna 1696 >>> from Bio.Seq import UnknownSeq 1697 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1698 >>> my_seq 1699 UnknownSeq(20, alphabet=DNAAlphabet(), character='n') 1700 >>> print(my_seq) 1701 nnnnnnnnnnnnnnnnnnnn 1702 >>> my_seq.upper() 1703 UnknownSeq(20, alphabet=DNAAlphabet(), character='N') 1704 >>> print(my_seq.upper()) 1705 NNNNNNNNNNNNNNNNNNNN 1706 1707 This will adjust the alphabet if required. See also the lower method. 1708 """ 1709 return UnknownSeq(self._length, self.alphabet._upper(), 1710 self._character.upper())
1711
1712 - def lower(self):
1713 """Return a lower case copy of the sequence. 1714 1715 This will adjust the alphabet if required: 1716 1717 >>> from Bio.Alphabet import IUPAC 1718 >>> from Bio.Seq import UnknownSeq 1719 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1720 >>> my_seq 1721 UnknownSeq(20, alphabet=ExtendedIUPACProtein(), character='X') 1722 >>> print(my_seq) 1723 XXXXXXXXXXXXXXXXXXXX 1724 >>> my_seq.lower() 1725 UnknownSeq(20, alphabet=ProteinAlphabet(), character='x') 1726 >>> print(my_seq.lower()) 1727 xxxxxxxxxxxxxxxxxxxx 1728 1729 See also the upper method. 1730 """ 1731 return UnknownSeq(self._length, self.alphabet._lower(), 1732 self._character.lower())
1733
1734 - def translate(self, **kwargs):
1735 """Translate an unknown nucleotide sequence into an unknown protein. 1736 1737 e.g. 1738 1739 >>> my_seq = UnknownSeq(9, character="N") 1740 >>> print(my_seq) 1741 NNNNNNNNN 1742 >>> my_protein = my_seq.translate() 1743 >>> my_protein 1744 UnknownSeq(3, alphabet=ProteinAlphabet(), character='X') 1745 >>> print(my_protein) 1746 XXX 1747 1748 In comparison, using a normal Seq object: 1749 1750 >>> my_seq = Seq("NNNNNNNNN") 1751 >>> print(my_seq) 1752 NNNNNNNNN 1753 >>> my_protein = my_seq.translate() 1754 >>> my_protein 1755 Seq('XXX', ExtendedIUPACProtein()) 1756 >>> print(my_protein) 1757 XXX 1758 1759 """ 1760 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1761 Alphabet.ProteinAlphabet): 1762 raise ValueError("Proteins cannot be translated!") 1763 return UnknownSeq(self._length // 3, Alphabet.generic_protein, "X")
1764
1765 - def ungap(self, gap=None):
1766 """Return a copy of the sequence without the gap character(s). 1767 1768 The gap character can be specified in two ways - either as an explicit 1769 argument, or via the sequence's alphabet. For example: 1770 1771 >>> from Bio.Seq import UnknownSeq 1772 >>> from Bio.Alphabet import Gapped, generic_dna 1773 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1774 >>> my_dna 1775 UnknownSeq(20, alphabet=Gapped(DNAAlphabet(), '-'), character='N') 1776 >>> my_dna.ungap() 1777 UnknownSeq(20, alphabet=DNAAlphabet(), character='N') 1778 >>> my_dna.ungap("-") 1779 UnknownSeq(20, alphabet=DNAAlphabet(), character='N') 1780 1781 If the UnknownSeq is using the gap character, then an empty Seq is 1782 returned: 1783 1784 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1785 >>> my_gap 1786 UnknownSeq(20, alphabet=Gapped(DNAAlphabet(), '-'), character='-') 1787 >>> my_gap.ungap() 1788 Seq('', DNAAlphabet()) 1789 >>> my_gap.ungap("-") 1790 Seq('', DNAAlphabet()) 1791 1792 Notice that the returned sequence's alphabet is adjusted to remove any 1793 explicit gap character declaration. 1794 """ 1795 # Offload the alphabet stuff 1796 s = Seq(self._character, self.alphabet).ungap(gap) 1797 if s: 1798 return UnknownSeq(self._length, s.alphabet, self._character) 1799 else: 1800 return Seq("", s.alphabet)
1801 1802
1803 -class MutableSeq(object):
1804 """An editable sequence object (with an alphabet). 1805 1806 Unlike normal python strings and our basic sequence object (the Seq class) 1807 which are immutable, the MutableSeq lets you edit the sequence in place. 1808 However, this means you cannot use a MutableSeq object as a dictionary key. 1809 1810 >>> from Bio.Seq import MutableSeq 1811 >>> from Bio.Alphabet import generic_dna 1812 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1813 >>> my_seq 1814 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1815 >>> my_seq[5] 1816 'T' 1817 >>> my_seq[5] = "A" 1818 >>> my_seq 1819 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1820 >>> my_seq[5] 1821 'A' 1822 >>> my_seq[5:8] = "NNN" 1823 >>> my_seq 1824 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1825 >>> len(my_seq) 1826 11 1827 1828 Note that the MutableSeq object does not support as many string-like 1829 or biological methods as the Seq object. 1830 """ 1831
1832 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1833 """Initialize the class.""" 1834 if sys.version_info[0] == 3: 1835 self.array_indicator = "u" 1836 else: 1837 self.array_indicator = "c" 1838 if isinstance(data, str): # TODO - What about unicode? 1839 self.data = array.array(self.array_indicator, data) 1840 else: 1841 self.data = data # assumes the input is an array 1842 self.alphabet = alphabet
1843
1844 - def __repr__(self):
1845 """Return (truncated) representation of the sequence for debugging.""" 1846 if self.alphabet is Alphabet.generic_alphabet: 1847 # Default used, we can omit it and simplify the representation 1848 a = "" 1849 else: 1850 a = ", %r" % self.alphabet 1851 if len(self) > 60: 1852 # Shows the last three letters as it is often useful to see if 1853 # there is a stop codon at the end of a sequence. 1854 # Note total length is 54+3+3=60 1855 return "{0}('{1}...{2}'{3!s})".format(self.__class__.__name__, 1856 str(self[:54]), 1857 str(self[-3:]), 1858 a) 1859 else: 1860 return "{0}('{1}'{2!s})".format(self.__class__.__name__, 1861 str(self), 1862 a)
1863
1864 - def __str__(self):
1865 """Return the full sequence as a python string. 1866 1867 Note that Biopython 1.44 and earlier would give a truncated 1868 version of repr(my_seq) for str(my_seq). If you are writing code 1869 which needs to be backwards compatible with old Biopython, you 1870 should continue to use my_seq.tostring() rather than str(my_seq). 1871 """ 1872 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1873 return "".join(self.data)
1874
1875 - def __eq__(self, other):
1876 """Compare the sequence to another sequence or a string (README). 1877 1878 Currently if compared to another sequence the alphabets must be 1879 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1880 an exception. Otherwise only the sequence itself is compared, not the 1881 precise alphabet. 1882 1883 A future release of Biopython will change this (and the Seq object etc) 1884 to use simple string comparison. The plan is that comparing sequences 1885 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1886 but not an exception. 1887 1888 During this transition period, please just do explicit comparisons: 1889 1890 >>> seq1 = MutableSeq("ACGT") 1891 >>> seq2 = MutableSeq("ACGT") 1892 >>> id(seq1) == id(seq2) 1893 False 1894 >>> str(seq1) == str(seq2) 1895 True 1896 1897 Biopython now does: 1898 1899 >>> seq1 == seq2 1900 True 1901 >>> seq1 == Seq("ACGT") 1902 True 1903 >>> seq1 == "ACGT" 1904 True 1905 1906 """ 1907 if hasattr(other, "alphabet"): 1908 if not Alphabet._check_type_compatible([self.alphabet, 1909 other.alphabet]): 1910 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1911 self.alphabet, other.alphabet), 1912 BiopythonWarning) 1913 if isinstance(other, MutableSeq): 1914 return self.data == other.data 1915 return str(self) == str(other)
1916
1917 - def __ne__(self, other):
1918 """Implement the not-equal operand.""" 1919 # Seem to require this method for Python 2 but not needed on Python 3? 1920 return not (self == other)
1921
1922 - def __lt__(self, other):
1923 """Implement the less-than operand.""" 1924 if hasattr(other, "alphabet"): 1925 if not Alphabet._check_type_compatible([self.alphabet, 1926 other.alphabet]): 1927 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1928 self.alphabet, other.alphabet), 1929 BiopythonWarning) 1930 if isinstance(other, MutableSeq): 1931 return self.data < other.data 1932 return str(self) < str(other)
1933
1934 - def __le__(self, other):
1935 """Implement the less-than or equal operand.""" 1936 if hasattr(other, "alphabet"): 1937 if not Alphabet._check_type_compatible([self.alphabet, 1938 other.alphabet]): 1939 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1940 self.alphabet, other.alphabet), 1941 BiopythonWarning) 1942 if isinstance(other, MutableSeq): 1943 return self.data <= other.data 1944 return str(self) <= str(other)
1945
1946 - def __len__(self):
1947 """Return the length of the sequence, use len(my_seq).""" 1948 return len(self.data)
1949
1950 - def __getitem__(self, index):
1951 """Return a subsequence of single letter, use my_seq[index]. 1952 1953 >>> my_seq = MutableSeq('ACTCGACGTCG') 1954 >>> my_seq[5] 1955 'A' 1956 """ 1957 # Note since Python 2.0, __getslice__ is deprecated 1958 # and __getitem__ is used instead. 1959 # See http://docs.python.org/ref/sequence-methods.html 1960 if isinstance(index, int): 1961 # Return a single letter as a string 1962 return self.data[index] 1963 else: 1964 # Return the (sub)sequence as another Seq object 1965 return MutableSeq(self.data[index], self.alphabet)
1966
1967 - def __setitem__(self, index, value):
1968 """Set a subsequence of single letter via value parameter. 1969 1970 >>> my_seq = MutableSeq('ACTCGACGTCG') 1971 >>> my_seq[0] = 'T' 1972 >>> my_seq 1973 MutableSeq('TCTCGACGTCG') 1974 """ 1975 # Note since Python 2.0, __setslice__ is deprecated 1976 # and __setitem__ is used instead. 1977 # See http://docs.python.org/ref/sequence-methods.html 1978 if isinstance(index, int): 1979 # Replacing a single letter with a new string 1980 self.data[index] = value 1981 else: 1982 # Replacing a sub-sequence 1983 if isinstance(value, MutableSeq): 1984 self.data[index] = value.data 1985 elif isinstance(value, type(self.data)): 1986 self.data[index] = value 1987 else: 1988 self.data[index] = array.array(self.array_indicator, 1989 str(value))
1990
1991 - def __delitem__(self, index):
1992 """Delete a subsequence of single letter. 1993 1994 >>> my_seq = MutableSeq('ACTCGACGTCG') 1995 >>> del my_seq[0] 1996 >>> my_seq 1997 MutableSeq('CTCGACGTCG') 1998 """ 1999 # Note since Python 2.0, __delslice__ is deprecated 2000 # and __delitem__ is used instead. 2001 # See http://docs.python.org/ref/sequence-methods.html 2002 2003 # Could be deleting a single letter, or a slice 2004 del self.data[index]
2005
2006 - def __add__(self, other):
2007 """Add another sequence or string to this sequence. 2008 2009 Returns a new MutableSeq object. 2010 """ 2011 if hasattr(other, "alphabet"): 2012 # other should be a Seq or a MutableSeq 2013 if not Alphabet._check_type_compatible([self.alphabet, 2014 other.alphabet]): 2015 raise TypeError( 2016 "Incompatible alphabets {0!r} and {1!r}".format( 2017 self.alphabet, other.alphabet)) 2018 # They should be the same sequence type (or one of them is generic) 2019 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 2020 if isinstance(other, MutableSeq): 2021 # See test_GAQueens.py for an historic usage of a non-string 2022 # alphabet! Adding the arrays should support this. 2023 return self.__class__(self.data + other.data, a) 2024 else: 2025 return self.__class__(str(self) + str(other), a) 2026 elif isinstance(other, basestring): 2027 # other is a plain string - use the current alphabet 2028 return self.__class__(str(self) + str(other), self.alphabet) 2029 else: 2030 raise TypeError
2031
2032 - def __radd__(self, other):
2033 """Add a sequence on the left. 2034 2035 >>> from Bio.Seq import MutableSeq 2036 >>> from Bio.Alphabet import generic_protein 2037 >>> "LV" + MutableSeq("MELKI", generic_protein) 2038 MutableSeq('LVMELKI', ProteinAlphabet()) 2039 """ 2040 if hasattr(other, "alphabet"): 2041 # other should be a Seq or a MutableSeq 2042 if not Alphabet._check_type_compatible([self.alphabet, 2043 other.alphabet]): 2044 raise TypeError( 2045 "Incompatible alphabets {0!r} and {1!r}".format( 2046 self.alphabet, other.alphabet)) 2047 # They should be the same sequence type (or one of them is generic) 2048 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 2049 if isinstance(other, MutableSeq): 2050 # See test_GAQueens.py for an historic usage of a non-string 2051 # alphabet! Adding the arrays should support this. 2052 return self.__class__(other.data + self.data, a) 2053 else: 2054 return self.__class__(str(other) + str(self), a) 2055 elif isinstance(other, basestring): 2056 # other is a plain string - use the current alphabet 2057 return self.__class__(str(other) + str(self), self.alphabet) 2058 else: 2059 raise TypeError
2060
2061 - def __mul__(self, other):
2062 """Multiply MutableSeq by integer. 2063 2064 Note this is not in-place and returns a new object, 2065 matching native Python list multiplication. 2066 2067 >>> from Bio.Seq import MutableSeq 2068 >>> from Bio.Alphabet import generic_dna 2069 >>> MutableSeq('ATG') * 2 2070 MutableSeq('ATGATG') 2071 >>> MutableSeq('ATG', generic_dna) * 2 2072 MutableSeq('ATGATG', DNAAlphabet()) 2073 """ 2074 if not isinstance(other, int): 2075 raise TypeError("can't multiply MutableSeq by non-int type") 2076 return self.__class__(self.data * other, self.alphabet)
2077
2078 - def __rmul__(self, other):
2079 """Multiply integer by MutableSeq. 2080 2081 Note this is not in-place and returns a new object, 2082 matching native Python list multiplication. 2083 2084 >>> from Bio.Seq import MutableSeq 2085 >>> from Bio.Alphabet import generic_dna 2086 >>> 2 * MutableSeq('ATG') 2087 MutableSeq('ATGATG') 2088 >>> 2 * MutableSeq('ATG', generic_dna) 2089 MutableSeq('ATGATG', DNAAlphabet()) 2090 """ 2091 if not isinstance(other, int): 2092 raise TypeError("can't multiply MutableSeq by non-int type") 2093 return self.__class__(self.data * other, self.alphabet)
2094
2095 - def __imul__(self, other):
2096 """Multiply MutableSeq in-place. 2097 2098 >>> from Bio.Seq import MutableSeq 2099 >>> from Bio.Alphabet import generic_dna 2100 >>> seq = MutableSeq('ATG', generic_dna) 2101 >>> seq *= 2 2102 >>> seq 2103 MutableSeq('ATGATG', DNAAlphabet()) 2104 """ 2105 if not isinstance(other, int): 2106 raise TypeError("can't multiply MutableSeq by non-int type") 2107 return self.__class__(self.data * other, self.alphabet)
2108
2109 - def append(self, c):
2110 """Add a subsequence to the mutable sequence object. 2111 2112 >>> my_seq = MutableSeq('ACTCGACGTCG') 2113 >>> my_seq.append('A') 2114 >>> my_seq 2115 MutableSeq('ACTCGACGTCGA') 2116 2117 No return value. 2118 """ 2119 self.data.append(c)
2120
2121 - def insert(self, i, c):
2122 """Add a subsequence to the mutable sequence object at a given index. 2123 2124 >>> my_seq = MutableSeq('ACTCGACGTCG') 2125 >>> my_seq.insert(0,'A') 2126 >>> my_seq 2127 MutableSeq('AACTCGACGTCG') 2128 >>> my_seq.insert(8,'G') 2129 >>> my_seq 2130 MutableSeq('AACTCGACGGTCG') 2131 2132 No return value. 2133 """ 2134 self.data.insert(i, c)
2135
2136 - def pop(self, i=(-1)):
2137 """Remove a subsequence of a single letter at given index. 2138 2139 >>> my_seq = MutableSeq('ACTCGACGTCG') 2140 >>> my_seq.pop() 2141 'G' 2142 >>> my_seq 2143 MutableSeq('ACTCGACGTC') 2144 >>> my_seq.pop() 2145 'C' 2146 >>> my_seq 2147 MutableSeq('ACTCGACGT') 2148 2149 Returns the last character of the sequence 2150 """ 2151 c = self.data[i] 2152 del self.data[i] 2153 return c
2154
2155 - def remove(self, item):
2156 """Remove a subsequence of a single letter from mutable sequence. 2157 2158 >>> my_seq = MutableSeq('ACTCGACGTCG') 2159 >>> my_seq.remove('C') 2160 >>> my_seq 2161 MutableSeq('ATCGACGTCG') 2162 >>> my_seq.remove('A') 2163 >>> my_seq 2164 MutableSeq('TCGACGTCG') 2165 2166 No return value. 2167 """ 2168 for i in range(len(self.data)): 2169 if self.data[i] == item: 2170 del self.data[i] 2171 return 2172 raise ValueError("MutableSeq.remove(x): x not in list")
2173
2174 - def count(self, sub, start=0, end=sys.maxsize):
2175 """Return a non-overlapping count, like that of a python string. 2176 2177 This behaves like the python string method of the same name, 2178 which does a non-overlapping count! 2179 2180 For an overlapping search use the newer count_overlap() method. 2181 2182 Returns an integer, the number of occurrences of substring 2183 argument sub in the (sub)sequence given by [start:end]. 2184 Optional arguments start and end are interpreted as in slice 2185 notation. 2186 2187 Arguments: 2188 - sub - a string or another Seq object to look for 2189 - start - optional integer, slice start 2190 - end - optional integer, slice end 2191 2192 e.g. 2193 2194 >>> from Bio.Seq import MutableSeq 2195 >>> my_mseq = MutableSeq("AAAATGA") 2196 >>> print(my_mseq.count("A")) 2197 5 2198 >>> print(my_mseq.count("ATG")) 2199 1 2200 >>> print(my_mseq.count(Seq("AT"))) 2201 1 2202 >>> print(my_mseq.count("AT", 2, -1)) 2203 1 2204 2205 HOWEVER, please note because that python strings, Seq objects and 2206 MutableSeq objects do a non-overlapping search, this may not give 2207 the answer you expect: 2208 2209 >>> "AAAA".count("AA") 2210 2 2211 >>> print(MutableSeq("AAAA").count("AA")) 2212 2 2213 2214 An overlapping search would give the answer as three! 2215 """ 2216 try: 2217 # TODO - Should we check the alphabet? 2218 search = str(sub) 2219 except AttributeError: 2220 search = sub 2221 2222 if not isinstance(search, basestring): 2223 raise TypeError("expected a string, Seq or MutableSeq") 2224 2225 if len(search) == 1: 2226 # Try and be efficient and work directly from the array. 2227 count = 0 2228 for c in self.data[start:end]: 2229 if c == search: 2230 count += 1 2231 return count 2232 else: 2233 # TODO - Can we do this more efficiently? 2234 return str(self).count(search, start, end)
2235
2236 - def count_overlap(self, sub, start=0, end=sys.maxsize):
2237 """Return an overlapping count. 2238 2239 For a non-overlapping search use the count() method. 2240 2241 Returns an integer, the number of occurrences of substring 2242 argument sub in the (sub)sequence given by [start:end]. 2243 Optional arguments start and end are interpreted as in slice 2244 notation. 2245 2246 Arguments: 2247 - sub - a string or another Seq object to look for 2248 - start - optional integer, slice start 2249 - end - optional integer, slice end 2250 2251 e.g. 2252 2253 >>> from Bio.Seq import MutableSeq 2254 >>> print(MutableSeq("AAAA").count_overlap("AA")) 2255 3 2256 >>> print(MutableSeq("ATATATATA").count_overlap("ATA")) 2257 4 2258 >>> print(MutableSeq("ATATATATA").count_overlap("ATA", 3, -1)) 2259 1 2260 2261 Where substrings do not overlap, should behave the same as 2262 the count() method: 2263 2264 >>> from Bio.Seq import MutableSeq 2265 >>> my_mseq = MutableSeq("AAAATGA") 2266 >>> print(my_mseq.count_overlap("A")) 2267 5 2268 >>> my_mseq.count_overlap("A") == my_mseq.count("A") 2269 True 2270 >>> print(my_mseq.count_overlap("ATG")) 2271 1 2272 >>> my_mseq.count_overlap("ATG") == my_mseq.count("ATG") 2273 True 2274 >>> print(my_mseq.count_overlap(Seq("AT"))) 2275 1 2276 >>> my_mseq.count_overlap(Seq("AT")) == my_mseq.count(Seq("AT")) 2277 True 2278 >>> print(my_mseq.count_overlap("AT", 2, -1)) 2279 1 2280 >>> my_mseq.count_overlap("AT", 2, -1) == my_mseq.count("AT", 2, -1) 2281 True 2282 2283 HOWEVER, do not use this method for such cases because the 2284 count() method is much for efficient. 2285 """ 2286 # The implementation is currently identical to that of 2287 # Seq.count_overlap() apart from the definition of sub_str 2288 sub_str = str(sub) 2289 self_str = str(self) 2290 overlap_count = 0 2291 while True: 2292 start = self_str.find(sub_str, start, end) + 1 2293 if start != 0: 2294 overlap_count += 1 2295 else: 2296 return overlap_count
2297
2298 - def index(self, item):
2299 """Return first occurrence position of a single entry (i.e. letter). 2300 2301 >>> my_seq = MutableSeq('ACTCGACGTCG') 2302 >>> my_seq.index('A') 2303 0 2304 >>> my_seq.index('T') 2305 2 2306 2307 Note unlike a Biopython Seq object, or Python string, multi-letter 2308 subsequences are not supported. 2309 """ 2310 for i in range(len(self.data)): 2311 if self.data[i] == item: 2312 return i 2313 raise ValueError("MutableSeq.index(x): x not in list")
2314
2315 - def reverse(self):
2316 """Modify the mutable sequence to reverse itself. 2317 2318 No return value. 2319 """ 2320 self.data.reverse()
2321
2322 - def complement(self):
2323 """Modify the mutable sequence to take on its complement. 2324 2325 Trying to complement a protein sequence raises an exception. 2326 2327 No return value. 2328 """ 2329 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 2330 Alphabet.ProteinAlphabet): 2331 raise ValueError("Proteins do not have complements!") 2332 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 2333 d = ambiguous_dna_complement 2334 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 2335 d = ambiguous_rna_complement 2336 elif 'U' in self.data and 'T' in self.data: 2337 # TODO - Handle this cleanly? 2338 raise ValueError("Mixed RNA/DNA found") 2339 elif 'U' in self.data: 2340 d = ambiguous_rna_complement 2341 else: 2342 d = ambiguous_dna_complement 2343 mixed = d.copy() # We're going to edit this to be mixed case! 2344 mixed.update((x.lower(), y.lower()) for x, y in d.items()) 2345 self.data = [mixed[_] for _ in self.data] 2346 self.data = array.array(self.array_indicator, self.data)
2347
2348 - def reverse_complement(self):
2349 """Modify the mutable sequence to take on its reverse complement. 2350 2351 Trying to reverse complement a protein sequence raises an exception. 2352 2353 No return value. 2354 """ 2355 self.complement() 2356 self.data.reverse()
2357
2358 - def extend(self, other):
2359 """Add a sequence to the original mutable sequence object. 2360 2361 >>> my_seq = MutableSeq('ACTCGACGTCG') 2362 >>> my_seq.extend('A') 2363 >>> my_seq 2364 MutableSeq('ACTCGACGTCGA') 2365 >>> my_seq.extend('TTT') 2366 >>> my_seq 2367 MutableSeq('ACTCGACGTCGATTT') 2368 2369 No return value. 2370 """ 2371 if isinstance(other, MutableSeq): 2372 for c in other.data: 2373 self.data.append(c) 2374 else: 2375 for c in other: 2376 self.data.append(c)
2377
2378 - def tostring(self):
2379 """Return the full sequence as a python string (DEPRECATED). 2380 2381 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 2382 as this method is officially deprecated. 2383 2384 Because str(my_seq) will give you the full sequence as a python string, 2385 there is often no need to make an explicit conversion. For example, 2386 2387 >>> my_seq = Seq("ATCGTG") 2388 >>> my_name = "seq_1" 2389 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 2390 ID={seq_1}, sequence={ATCGTG} 2391 2392 On Biopython 1.44 or older you would have to have done this: 2393 2394 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 2395 ID={seq_1}, sequence={ATCGTG} 2396 """ 2397 from Bio import BiopythonDeprecationWarning 2398 warnings.warn("This method is obsolete; please use str(my_seq) " 2399 "instead of my_seq.tostring().", 2400 BiopythonDeprecationWarning) 2401 return "".join(self.data)
2402
2403 - def toseq(self):
2404 """Return the full sequence as a new immutable Seq object. 2405 2406 >>> from Bio.Seq import Seq 2407 >>> from Bio.Alphabet import IUPAC 2408 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 2409 ... IUPAC.protein) 2410 >>> my_mseq 2411 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 2412 >>> my_mseq.toseq() 2413 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 2414 2415 Note that the alphabet is preserved. 2416 """ 2417 return Seq("".join(self.data), self.alphabet)
2418 2419 2420 # The transcribe, backward_transcribe, and translate functions are 2421 # user-friendly versions of the corresponding functions in Bio.Transcribe 2422 # and Bio.Translate. The functions work both on Seq objects, and on strings. 2423
2424 -def transcribe(dna):
2425 """Transcribe a DNA sequence into RNA. 2426 2427 If given a string, returns a new string object. 2428 2429 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 2430 2431 Trying to transcribe a protein or RNA sequence raises an exception. 2432 2433 e.g. 2434 2435 >>> transcribe("ACTGN") 2436 'ACUGN' 2437 """ 2438 if isinstance(dna, Seq): 2439 return dna.transcribe() 2440 elif isinstance(dna, MutableSeq): 2441 return dna.toseq().transcribe() 2442 else: 2443 return dna.replace('T', 'U').replace('t', 'u')
2444 2445
2446 -def back_transcribe(rna):
2447 """Return the RNA sequence back-transcribed into DNA. 2448 2449 If given a string, returns a new string object. 2450 2451 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 2452 2453 Trying to transcribe a protein or DNA sequence raises an exception. 2454 2455 e.g. 2456 2457 >>> back_transcribe("ACUGN") 2458 'ACTGN' 2459 """ 2460 if isinstance(rna, Seq): 2461 return rna.back_transcribe() 2462 elif isinstance(rna, MutableSeq): 2463 return rna.toseq().back_transcribe() 2464 else: 2465 return rna.replace('U', 'T').replace('u', 't')
2466 2467
2468 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 2469 cds=False, pos_stop="X", gap=None):
2470 """Translate nucleotide string into a protein string (PRIVATE). 2471 2472 Arguments: 2473 - sequence - a string 2474 - table - a CodonTable object (NOT a table name or id number) 2475 - stop_symbol - a single character string, what to use for terminators. 2476 - to_stop - boolean, should translation terminate at the first 2477 in frame stop codon? If there is no in-frame stop codon 2478 then translation continues to the end. 2479 - pos_stop - a single character string for a possible stop codon 2480 (e.g. TAN or NNN) 2481 - cds - Boolean, indicates this is a complete CDS. If True, this 2482 checks the sequence starts with a valid alternative start 2483 codon (which will be translated as methionine, M), that the 2484 sequence length is a multiple of three, and that there is a 2485 single in frame stop codon at the end (this will be excluded 2486 from the protein sequence, regardless of the to_stop option). 2487 If these tests fail, an exception is raised. 2488 - gap - Single character string to denote symbol used for gaps. 2489 Defaults to None. 2490 2491 Returns a string. 2492 2493 e.g. 2494 2495 >>> from Bio.Data import CodonTable 2496 >>> table = CodonTable.ambiguous_dna_by_id[1] 2497 >>> _translate_str("AAA", table) 2498 'K' 2499 >>> _translate_str("TAR", table) 2500 '*' 2501 >>> _translate_str("TAN", table) 2502 'X' 2503 >>> _translate_str("TAN", table, pos_stop="@") 2504 '@' 2505 >>> _translate_str("TA?", table) 2506 Traceback (most recent call last): 2507 ... 2508 TranslationError: Codon 'TA?' is invalid 2509 2510 In a change to older versions of Biopython, partial codons are now 2511 always regarded as an error (previously only checked if cds=True) 2512 and will trigger a warning (likely to become an exception in a 2513 future release). 2514 2515 If **cds=True**, the start and stop codons are checked, and the start 2516 codon will be translated at methionine. The sequence must be an 2517 while number of codons. 2518 2519 >>> _translate_str("ATGCCCTAG", table, cds=True) 2520 'MP' 2521 >>> _translate_str("AAACCCTAG", table, cds=True) 2522 Traceback (most recent call last): 2523 ... 2524 TranslationError: First codon 'AAA' is not a start codon 2525 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2526 Traceback (most recent call last): 2527 ... 2528 TranslationError: Extra in frame stop codon found. 2529 """ 2530 sequence = sequence.upper() 2531 amino_acids = [] 2532 forward_table = table.forward_table 2533 stop_codons = table.stop_codons 2534 if table.nucleotide_alphabet.letters is not None: 2535 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2536 else: 2537 # Assume the worst case, ambiguous DNA or RNA: 2538 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2539 IUPAC.ambiguous_rna.letters.upper()) 2540 n = len(sequence) 2541 2542 # Check for tables with 'ambiguous' (dual-coding) stop codons: 2543 dual_coding = [c for c in stop_codons if c in forward_table] 2544 if dual_coding: 2545 c = dual_coding[0] 2546 if to_stop: 2547 raise ValueError("You cannot use 'to_stop=True' with this table " 2548 "as it contains {} codon(s) which can be both " 2549 " STOP and an amino acid (e.g. '{}' -> '{}' or " 2550 "STOP)." 2551 .format(len(dual_coding), c, forward_table[c])) 2552 warnings.warn("This table contains {} codon(s) which code(s) for both " 2553 "STOP and an amino acid (e.g. '{}' -> '{}' or STOP). " 2554 "Such codons will be translated as amino acid." 2555 .format(len(dual_coding), c, forward_table[c]), 2556 BiopythonWarning) 2557 2558 if cds: 2559 if str(sequence[:3]).upper() not in table.start_codons: 2560 raise CodonTable.TranslationError( 2561 "First codon '{0}' is not a start codon".format(sequence[:3])) 2562 if n % 3 != 0: 2563 raise CodonTable.TranslationError( 2564 "Sequence length {0} is not a multiple of three".format(n)) 2565 if str(sequence[-3:]).upper() not in stop_codons: 2566 raise CodonTable.TranslationError( 2567 "Final codon '{0}' is not a stop codon".format(sequence[-3:])) 2568 # Don't translate the stop symbol, and manually translate the M 2569 sequence = sequence[3:-3] 2570 n -= 6 2571 amino_acids = ["M"] 2572 elif n % 3 != 0: 2573 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2574 "Explicitly trim the sequence or add trailing N before " 2575 "translation. This may become an error in future.", 2576 BiopythonWarning) 2577 if gap is not None: 2578 if not isinstance(gap, basestring): 2579 raise TypeError("Gap character should be a single character " 2580 "string.") 2581 elif len(gap) > 1: 2582 raise ValueError("Gap character should be a single character " 2583 "string.") 2584 2585 for i in range(0, n - n % 3, 3): 2586 codon = sequence[i:i + 3] 2587 try: 2588 amino_acids.append(forward_table[codon]) 2589 except (KeyError, CodonTable.TranslationError): 2590 if codon in table.stop_codons: 2591 if cds: 2592 raise CodonTable.TranslationError( 2593 "Extra in frame stop codon found.") 2594 if to_stop: 2595 break 2596 amino_acids.append(stop_symbol) 2597 elif valid_letters.issuperset(set(codon)): 2598 # Possible stop codon (e.g. NNN or TAN) 2599 amino_acids.append(pos_stop) 2600 elif gap is not None and codon == gap * 3: 2601 # Gapped translation 2602 amino_acids.append(gap) 2603 else: 2604 raise CodonTable.TranslationError( 2605 "Codon '{0}' is invalid".format(codon)) 2606 return "".join(amino_acids)
2607 2608
2609 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2610 cds=False, gap=None):
2611 """Translate a nucleotide sequence into amino acids. 2612 2613 If given a string, returns a new string object. Given a Seq or 2614 MutableSeq, returns a Seq object with a protein alphabet. 2615 2616 Arguments: 2617 - table - Which codon table to use? This can be either a name 2618 (string), an NCBI identifier (integer), or a CodonTable object 2619 (useful for non-standard genetic codes). Defaults to the "Standard" 2620 table. 2621 - stop_symbol - Single character string, what to use for any 2622 terminators, defaults to the asterisk, "*". 2623 - to_stop - Boolean, defaults to False meaning do a full 2624 translation continuing on past any stop codons 2625 (translated as the specified stop_symbol). If 2626 True, translation is terminated at the first in 2627 frame stop codon (and the stop_symbol is not 2628 appended to the returned protein sequence). 2629 - cds - Boolean, indicates this is a complete CDS. If True, this 2630 checks the sequence starts with a valid alternative start 2631 codon (which will be translated as methionine, M), that the 2632 sequence length is a multiple of three, and that there is a 2633 single in frame stop codon at the end (this will be excluded 2634 from the protein sequence, regardless of the to_stop option). 2635 If these tests fail, an exception is raised. 2636 - gap - Single character string to denote symbol used for gaps. 2637 Defaults to None. 2638 2639 A simple string example using the default (standard) genetic code: 2640 2641 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2642 >>> translate(coding_dna) 2643 'VAIVMGR*KGAR*' 2644 >>> translate(coding_dna, stop_symbol="@") 2645 'VAIVMGR@KGAR@' 2646 >>> translate(coding_dna, to_stop=True) 2647 'VAIVMGR' 2648 2649 Now using NCBI table 2, where TGA is not a stop codon: 2650 2651 >>> translate(coding_dna, table=2) 2652 'VAIVMGRWKGAR*' 2653 >>> translate(coding_dna, table=2, to_stop=True) 2654 'VAIVMGRWKGAR' 2655 2656 In fact this example uses an alternative start codon valid under NCBI 2657 table 2, GTG, which means this example is a complete valid CDS which 2658 when translated should really start with methionine (not valine): 2659 2660 >>> translate(coding_dna, table=2, cds=True) 2661 'MAIVMGRWKGAR' 2662 2663 Note that if the sequence has no in-frame stop codon, then the to_stop 2664 argument has no effect: 2665 2666 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2667 >>> translate(coding_dna2) 2668 'VAIVMGR' 2669 >>> translate(coding_dna2, to_stop=True) 2670 'VAIVMGR' 2671 2672 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2673 or a stop codon. These are translated as "X". Any invalid codon 2674 (e.g. "TA?" or "T-A") will throw a TranslationError. 2675 2676 It will however translate either DNA or RNA. 2677 2678 NOTE - Since version 1.71 Biopython contains codon tables with 'ambiguous 2679 stop codons'. These are stop codons with unambiguous sequence but which 2680 have a context dependent coding as STOP or as amino acid. With these tables 2681 'to_stop' must be False (otherwise a ValueError is raised). The dual 2682 coding codons will always be translated as amino acid, except for 2683 'cds=True', where the last codon will be translated as STOP. 2684 2685 >>> coding_dna3 = "ATGGCACGGAAGTGA" 2686 >>> translate(coding_dna3) 2687 'MARK*' 2688 2689 >>> translate(coding_dna3, table=27) # Table 27: TGA -> STOP or W 2690 'MARKW' 2691 2692 It will however raise a BiopythonWarning (not shown). 2693 2694 >>> translate(coding_dna3, table=27, cds=True) 2695 'MARK' 2696 2697 >>> translate(coding_dna3, table=27, to_stop=True) 2698 Traceback (most recent call last): 2699 ... 2700 ValueError: You cannot use 'to_stop=True' with this table ... 2701 """ 2702 if isinstance(sequence, Seq): 2703 return sequence.translate(table, stop_symbol, to_stop, cds) 2704 elif isinstance(sequence, MutableSeq): 2705 # Return a Seq object 2706 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2707 else: 2708 # Assume its a string, return a string 2709 try: 2710 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2711 except ValueError: 2712 codon_table = CodonTable.ambiguous_generic_by_name[table] 2713 except (AttributeError, TypeError): 2714 if isinstance(table, CodonTable.CodonTable): 2715 codon_table = table 2716 else: 2717 raise ValueError('Bad table argument') 2718 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds, 2719 gap=gap)
2720 2721
2722 -def reverse_complement(sequence):
2723 """Return the reverse complement sequence of a nucleotide string. 2724 2725 If given a string, returns a new string object. 2726 Given a Seq or a MutableSeq, returns a new Seq object with the same 2727 alphabet. 2728 2729 Supports unambiguous and ambiguous nucleotide sequences. 2730 2731 e.g. 2732 2733 >>> reverse_complement("ACTG-NH") 2734 'DN-CAGT' 2735 """ 2736 return complement(sequence)[::-1]
2737 2738
2739 -def complement(sequence):
2740 """Return the complement sequence of a nucleotide string. 2741 2742 If given a string, returns a new string object. 2743 Given a Seq or a MutableSeq, returns a new Seq object with the same 2744 alphabet. 2745 2746 Supports unambiguous and ambiguous nucleotide sequences. 2747 2748 e.g. 2749 2750 >>> complement("ACTG-NH") 2751 'TGAC-ND' 2752 """ 2753 if isinstance(sequence, Seq): 2754 # Return a Seq 2755 return sequence.complement() 2756 elif isinstance(sequence, MutableSeq): 2757 # Return a Seq 2758 # Don't use the MutableSeq reverse_complement method as it is 2759 # 'in place'. 2760 return sequence.toseq().complement() 2761 2762 # Assume its a string. 2763 # In order to avoid some code duplication, the old code would turn the 2764 # string into a Seq, use the reverse_complement method, and convert back 2765 # to a string. 2766 # This worked, but is over five times slower on short sequences! 2767 if ('U' in sequence or 'u' in sequence) \ 2768 and ('T' in sequence or 't' in sequence): 2769 raise ValueError("Mixed RNA/DNA found") 2770 elif 'U' in sequence or 'u' in sequence: 2771 ttable = _rna_complement_table 2772 else: 2773 ttable = _dna_complement_table 2774 return sequence.translate(ttable)
2775 2776
2777 -def _test():
2778 """Run the Bio.Seq module's doctests (PRIVATE).""" 2779 print("Running doctests...") 2780 import doctest 2781 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2782 print("Done")
2783 2784 2785 if __name__ == "__main__": 2786 _test() 2787