Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2015 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provide objects to represent biological sequences with alphabets. 
   9   
  10  See also the Seq_ wiki and the chapter in our tutorial: 
  11   - `HTML Tutorial`_ 
  12   - `PDF Tutorial`_ 
  13   
  14  .. _Seq: http://biopython.org/wiki/Seq 
  15  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  16  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  17   
  18  """ 
  19  from __future__ import print_function 
  20   
  21  import string  # for maketrans only 
  22  import array 
  23  import sys 
  24  import warnings 
  25   
  26  from Bio._py3k import range 
  27  from Bio._py3k import basestring 
  28   
  29  from Bio import BiopythonWarning 
  30  from Bio import Alphabet 
  31  from Bio.Alphabet import IUPAC 
  32  from Bio.Data.IUPACData import (ambiguous_dna_complement, 
  33                                  ambiguous_rna_complement) 
  34  from Bio.Data import CodonTable 
  35   
  36   
37 -def _maketrans(complement_mapping):
38 """Make a python string translation table (PRIVATE). 39 40 Arguments: 41 - complement_mapping - a dictionary such as ambiguous_dna_complement 42 and ambiguous_rna_complement from Data.IUPACData. 43 44 Returns a translation table (a string of length 256) for use with the 45 python string's translate method to use in a (reverse) complement. 46 47 Compatible with lower case and upper case sequences. 48 49 For internal use only. 50 """ 51 before = ''.join(complement_mapping.keys()) 52 after = ''.join(complement_mapping.values()) 53 before += before.lower() 54 after += after.lower() 55 if sys.version_info[0] == 3: 56 return str.maketrans(before, after) 57 else: 58 return string.maketrans(before, after)
59 60 61 _dna_complement_table = _maketrans(ambiguous_dna_complement) 62 _rna_complement_table = _maketrans(ambiguous_rna_complement) 63 64
65 -class Seq(object):
66 """Read-only sequence object (essentially a string with an alphabet). 67 68 Like normal python strings, our basic sequence object is immutable. 69 This prevents you from doing my_seq[5] = "A" for example, but does allow 70 Seq objects to be used as dictionary keys. 71 72 The Seq object provides a number of string like methods (such as count, 73 find, split and strip), which are alphabet aware where appropriate. 74 75 In addition to the string like sequence, the Seq object has an alphabet 76 property. This is an instance of an Alphabet class from Bio.Alphabet, 77 for example generic DNA, or IUPAC DNA. This describes the type of molecule 78 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 79 (letters). 80 81 The Seq object also provides some biological methods, such as complement, 82 reverse_complement, transcribe, back_transcribe and translate (which are 83 not applicable to sequences with a protein alphabet). 84 """ 85
86 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
87 """Create a Seq object. 88 89 Arguments: 90 - seq - Sequence, required (string) 91 - alphabet - Optional argument, an Alphabet object from 92 Bio.Alphabet 93 94 You will typically use Bio.SeqIO to read in sequences from files as 95 SeqRecord objects, whose sequence will be exposed as a Seq object via 96 the seq property. 97 98 However, will often want to create your own Seq objects directly: 99 100 >>> from Bio.Seq import Seq 101 >>> from Bio.Alphabet import IUPAC 102 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 103 ... IUPAC.protein) 104 >>> my_seq 105 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 106 >>> print(my_seq) 107 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 108 >>> my_seq.alphabet 109 IUPACProtein() 110 """ 111 # Enforce string storage 112 if not isinstance(data, basestring): 113 raise TypeError("The sequence data given to a Seq object should " 114 "be a string (not another Seq object etc)") 115 self._data = data 116 self.alphabet = alphabet # Seq API requirement
117
118 - def __repr__(self):
119 """Return (truncated) representation of the sequence for debugging.""" 120 if len(self) > 60: 121 # Shows the last three letters as it is often useful to see if 122 # there is a stop codon at the end of a sequence. 123 # Note total length is 54+3+3=60 124 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 125 str(self)[:54], 126 str(self)[-3:], 127 self.alphabet) 128 else: 129 return '{0}({1!r}, {2!r})'.format(self.__class__.__name__, 130 self._data, 131 self.alphabet)
132
133 - def __str__(self):
134 """Return the full sequence as a python string, use str(my_seq). 135 136 Note that Biopython 1.44 and earlier would give a truncated 137 version of repr(my_seq) for str(my_seq). If you are writing code 138 which need to be backwards compatible with old Biopython, you 139 should continue to use my_seq.tostring() rather than str(my_seq). 140 """ 141 return self._data
142
143 - def __hash__(self):
144 """Hash for comparison. 145 146 See the __cmp__ documentation - this has changed from past 147 versions of Biopython! 148 """ 149 # TODO - remove this warning in a future release 150 warnings.warn("Biopython Seq objects now use string comparison. " 151 "Older versions of Biopython used object comparison. " 152 "During this transition, please use hash(id(my_seq)) " 153 "or my_dict[id(my_seq)] if you want the old behaviour, " 154 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 155 "the new string hashing behaviour.", BiopythonWarning) 156 return hash(str(self))
157
158 - def __eq__(self, other):
159 """Compare the sequence to another sequence or a string (README). 160 161 Historically comparing Seq objects has done Python object comparison. 162 After considerable discussion (keeping in mind constraints of the 163 Python language, hashes and dictionary support), Biopython now uses 164 simple string comparison (with a warning about the change). 165 166 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 167 warning. 168 169 During this transition period, please just do explicit comparisons: 170 171 >>> from Bio.Seq import Seq 172 >>> from Bio.Alphabet import generic_dna 173 >>> seq1 = Seq("ACGT") 174 >>> seq2 = Seq("ACGT") 175 >>> id(seq1) == id(seq2) 176 False 177 >>> str(seq1) == str(seq2) 178 True 179 180 The new behaviour is to use string-like equality: 181 182 >>> from Bio.Seq import Seq 183 >>> from Bio.Alphabet import generic_dna 184 >>> seq1 == seq2 185 True 186 >>> seq1 == "ACGT" 187 True 188 >>> seq1 == Seq("ACGT", generic_dna) 189 True 190 191 """ 192 if hasattr(other, "alphabet"): 193 # other could be a Seq or a MutableSeq 194 if not Alphabet._check_type_compatible([self.alphabet, 195 other.alphabet]): 196 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 197 self.alphabet, other.alphabet), 198 BiopythonWarning) 199 return str(self) == str(other)
200
201 - def __ne__(self, other):
202 """Implement the not-equal operand.""" 203 # Require this method for Python 2 but not needed on Python 3 204 return not self == other
205
206 - def __lt__(self, other):
207 """Implement the less-than operand.""" 208 if hasattr(other, "alphabet"): 209 if not Alphabet._check_type_compatible([self.alphabet, 210 other.alphabet]): 211 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 212 self.alphabet, other.alphabet), 213 BiopythonWarning) 214 return str(self) < str(other)
215
216 - def __le__(self, other):
217 """Implement the less-than or equal operand.""" 218 if hasattr(other, "alphabet"): 219 if not Alphabet._check_type_compatible([self.alphabet, 220 other.alphabet]): 221 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 222 self.alphabet, other.alphabet), 223 BiopythonWarning) 224 return str(self) <= str(other)
225
226 - def __len__(self):
227 """Return the length of the sequence, use len(my_seq).""" 228 return len(self._data) # Seq API requirement
229
230 - def __getitem__(self, index): # Seq API requirement
231 """Return a subsequence of single letter, use my_seq[index].""" 232 # Note since Python 2.0, __getslice__ is deprecated 233 # and __getitem__ is used instead. 234 # See http://docs.python.org/ref/sequence-methods.html 235 if isinstance(index, int): 236 # Return a single letter as a string 237 return self._data[index] 238 else: 239 # Return the (sub)sequence as another Seq object 240 return Seq(self._data[index], self.alphabet)
241
242 - def __add__(self, other):
243 """Add another sequence or string to this sequence. 244 245 If adding a string to a Seq, the alphabet is preserved: 246 247 >>> from Bio.Seq import Seq 248 >>> from Bio.Alphabet import generic_protein 249 >>> Seq("MELKI", generic_protein) + "LV" 250 Seq('MELKILV', ProteinAlphabet()) 251 252 When adding two Seq (like) objects, the alphabets are important. 253 Consider this example: 254 255 >>> from Bio.Seq import Seq 256 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 257 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 258 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 259 >>> unamb_dna_seq 260 Seq('ACGT', IUPACUnambiguousDNA()) 261 >>> ambig_dna_seq 262 Seq('ACRGT', IUPACAmbiguousDNA()) 263 264 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 265 the more general ambiguous IUPAC DNA alphabet: 266 267 >>> unamb_dna_seq + ambig_dna_seq 268 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 269 270 However, if the default generic alphabet is included, the result is 271 a generic alphabet: 272 273 >>> Seq("") + ambig_dna_seq 274 Seq('ACRGT', Alphabet()) 275 276 You can't add RNA and DNA sequences: 277 278 >>> from Bio.Alphabet import generic_dna, generic_rna 279 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 280 Traceback (most recent call last): 281 ... 282 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 283 284 You can't add nucleotide and protein sequences: 285 286 >>> from Bio.Alphabet import generic_dna, generic_protein 287 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 288 Traceback (most recent call last): 289 ... 290 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 291 """ 292 if hasattr(other, "alphabet"): 293 # other should be a Seq or a MutableSeq 294 if not Alphabet._check_type_compatible([self.alphabet, 295 other.alphabet]): 296 raise TypeError( 297 "Incompatible alphabets {0!r} and {1!r}".format( 298 self.alphabet, other.alphabet)) 299 # They should be the same sequence type (or one of them is generic) 300 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 301 return self.__class__(str(self) + str(other), a) 302 elif isinstance(other, basestring): 303 # other is a plain string - use the current alphabet 304 return self.__class__(str(self) + other, self.alphabet) 305 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 306 if isinstance(other, SeqRecord): 307 # Get the SeqRecord's __radd__ to handle this 308 return NotImplemented 309 else: 310 raise TypeError
311
312 - def __radd__(self, other):
313 """Add a sequence on the left. 314 315 If adding a string to a Seq, the alphabet is preserved: 316 317 >>> from Bio.Seq import Seq 318 >>> from Bio.Alphabet import generic_protein 319 >>> "LV" + Seq("MELKI", generic_protein) 320 Seq('LVMELKI', ProteinAlphabet()) 321 322 Adding two Seq (like) objects is handled via the __add__ method. 323 """ 324 if hasattr(other, "alphabet"): 325 # other should be a Seq or a MutableSeq 326 if not Alphabet._check_type_compatible([self.alphabet, 327 other.alphabet]): 328 raise TypeError( 329 "Incompatible alphabets {0!r} and {1!r}".format( 330 self.alphabet, other.alphabet)) 331 # They should be the same sequence type (or one of them is generic) 332 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 333 return self.__class__(str(other) + str(self), a) 334 elif isinstance(other, basestring): 335 # other is a plain string - use the current alphabet 336 return self.__class__(other + str(self), self.alphabet) 337 else: 338 raise TypeError
339
340 - def tostring(self): # Seq API requirement
341 """Return the full sequence as a python string (DEPRECATED). 342 343 You are now encouraged to use str(my_seq) instead of 344 my_seq.tostring(). 345 """ 346 from Bio import BiopythonDeprecationWarning 347 warnings.warn("This method is obsolete; please use str(my_seq) " 348 "instead of my_seq.tostring().", 349 BiopythonDeprecationWarning) 350 return str(self) 351
352 - def tomutable(self): # Needed? Or use a function?
353 """Return the full sequence as a MutableSeq object. 354 355 >>> from Bio.Seq import Seq 356 >>> from Bio.Alphabet import IUPAC 357 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 358 ... IUPAC.protein) 359 >>> my_seq 360 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 361 >>> my_seq.tomutable() 362 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 363 364 Note that the alphabet is preserved. 365 """ 366 return MutableSeq(str(self), self.alphabet) 367
368 - def _get_seq_str_and_check_alphabet(self, other_sequence):
369 """Convert string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 370 371 For a string argument, returns the string. 372 373 For a Seq or MutableSeq, it checks the alphabet is compatible 374 (raising an exception if it isn't), and then returns a string. 375 """ 376 try: 377 other_alpha = other_sequence.alphabet 378 except AttributeError: 379 # Assume other_sequence is a string 380 return other_sequence 381 382 # Other should be a Seq or a MutableSeq 383 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 384 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 385 self.alphabet, other_alpha)) 386 # Return as a string 387 return str(other_sequence)
388
389 - def count(self, sub, start=0, end=sys.maxsize):
390 """Return a non-overlapping count, like that of a python string. 391 392 This behaves like the python string method of the same name, 393 which does a non-overlapping count! 394 395 For an overlapping search use the newer count_overlap() method. 396 397 Returns an integer, the number of occurrences of substring 398 argument sub in the (sub)sequence given by [start:end]. 399 Optional arguments start and end are interpreted as in slice 400 notation. 401 402 Arguments: 403 - sub - a string or another Seq object to look for 404 - start - optional integer, slice start 405 - end - optional integer, slice end 406 407 e.g. 408 409 >>> from Bio.Seq import Seq 410 >>> my_seq = Seq("AAAATGA") 411 >>> print(my_seq.count("A")) 412 5 413 >>> print(my_seq.count("ATG")) 414 1 415 >>> print(my_seq.count(Seq("AT"))) 416 1 417 >>> print(my_seq.count("AT", 2, -1)) 418 1 419 420 HOWEVER, please note because python strings and Seq objects (and 421 MutableSeq objects) do a non-overlapping search, this may not give 422 the answer you expect: 423 424 >>> "AAAA".count("AA") 425 2 426 >>> print(Seq("AAAA").count("AA")) 427 2 428 429 An overlapping search, as implemented in .count_overlap(), 430 would give the answer as three! 431 """ 432 # If it has one, check the alphabet: 433 sub_str = self._get_seq_str_and_check_alphabet(sub) 434 return str(self).count(sub_str, start, end)
435
436 - def count_overlap(self, sub, start=0, end=sys.maxsize):
437 """Return an overlapping count. 438 439 For a non-overlapping search use the count() method. 440 441 Returns an integer, the number of occurrences of substring 442 argument sub in the (sub)sequence given by [start:end]. 443 Optional arguments start and end are interpreted as in slice 444 notation. 445 446 Arguments: 447 - sub - a string or another Seq object to look for 448 - start - optional integer, slice start 449 - end - optional integer, slice end 450 451 e.g. 452 453 >>> from Bio.Seq import Seq 454 >>> print(Seq("AAAA").count_overlap("AA")) 455 3 456 >>> print(Seq("ATATATATA").count_overlap("ATA")) 457 4 458 >>> print(Seq("ATATATATA").count_overlap("ATA", 3, -1)) 459 1 460 461 Where substrings do not overlap, should behave the same as 462 the count() method: 463 464 >>> from Bio.Seq import Seq 465 >>> my_seq = Seq("AAAATGA") 466 >>> print(my_seq.count_overlap("A")) 467 5 468 >>> my_seq.count_overlap("A") == my_seq.count("A") 469 True 470 >>> print(my_seq.count_overlap("ATG")) 471 1 472 >>> my_seq.count_overlap("ATG") == my_seq.count("ATG") 473 True 474 >>> print(my_seq.count_overlap(Seq("AT"))) 475 1 476 >>> my_seq.count_overlap(Seq("AT")) == my_seq.count(Seq("AT")) 477 True 478 >>> print(my_seq.count_overlap("AT", 2, -1)) 479 1 480 >>> my_seq.count_overlap("AT", 2, -1) == my_seq.count("AT", 2, -1) 481 True 482 483 HOWEVER, do not use this method for such cases because the 484 count() method is much for efficient. 485 """ 486 sub_str = self._get_seq_str_and_check_alphabet(sub) 487 self_str = str(self) 488 overlap_count = 0 489 while True: 490 start = self_str.find(sub_str, start, end) + 1 491 if start != 0: 492 overlap_count += 1 493 else: 494 return overlap_count
495
496 - def __contains__(self, char):
497 """Implement the 'in' keyword, like a python string. 498 499 e.g. 500 501 >>> from Bio.Seq import Seq 502 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 503 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 504 >>> "AAA" in my_dna 505 True 506 >>> Seq("AAA") in my_dna 507 True 508 >>> Seq("AAA", generic_dna) in my_dna 509 True 510 511 Like other Seq methods, this will raise a type error if another Seq 512 (or Seq like) object with an incompatible alphabet is used: 513 514 >>> Seq("AAA", generic_rna) in my_dna 515 Traceback (most recent call last): 516 ... 517 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 518 >>> Seq("AAA", generic_protein) in my_dna 519 Traceback (most recent call last): 520 ... 521 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 522 """ 523 # If it has one, check the alphabet: 524 sub_str = self._get_seq_str_and_check_alphabet(char) 525 return sub_str in str(self)
526
527 - def find(self, sub, start=0, end=sys.maxsize):
528 """Find method, like that of a python string. 529 530 This behaves like the python string method of the same name. 531 532 Returns an integer, the index of the first occurrence of substring 533 argument sub in the (sub)sequence given by [start:end]. 534 535 Arguments: 536 - sub - a string or another Seq object to look for 537 - start - optional integer, slice start 538 - end - optional integer, slice end 539 540 Returns -1 if the subsequence is NOT found. 541 542 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 543 544 >>> from Bio.Seq import Seq 545 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 546 >>> my_rna.find("AUG") 547 3 548 """ 549 # If it has one, check the alphabet: 550 sub_str = self._get_seq_str_and_check_alphabet(sub) 551 return str(self).find(sub_str, start, end)
552
553 - def rfind(self, sub, start=0, end=sys.maxsize):
554 """Find from right method, like that of a python string. 555 556 This behaves like the python string method of the same name. 557 558 Returns an integer, the index of the last (right most) occurrence of 559 substring argument sub in the (sub)sequence given by [start:end]. 560 561 Arguments: 562 - sub - a string or another Seq object to look for 563 - start - optional integer, slice start 564 - end - optional integer, slice end 565 566 Returns -1 if the subsequence is NOT found. 567 568 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 569 570 >>> from Bio.Seq import Seq 571 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 572 >>> my_rna.rfind("AUG") 573 15 574 """ 575 # If it has one, check the alphabet: 576 sub_str = self._get_seq_str_and_check_alphabet(sub) 577 return str(self).rfind(sub_str, start, end)
578
579 - def startswith(self, prefix, start=0, end=sys.maxsize):
580 """Return True if the Seq starts with the given prefix, False otherwise. 581 582 This behaves like the python string method of the same name. 583 584 Return True if the sequence starts with the specified prefix 585 (a string or another Seq object), False otherwise. 586 With optional start, test sequence beginning at that position. 587 With optional end, stop comparing sequence at that position. 588 prefix can also be a tuple of strings to try. e.g. 589 590 >>> from Bio.Seq import Seq 591 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 592 >>> my_rna.startswith("GUC") 593 True 594 >>> my_rna.startswith("AUG") 595 False 596 >>> my_rna.startswith("AUG", 3) 597 True 598 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 599 True 600 """ 601 # If it has one, check the alphabet: 602 if isinstance(prefix, tuple): 603 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 604 for p in prefix) 605 return str(self).startswith(prefix_strs, start, end) 606 else: 607 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 608 return str(self).startswith(prefix_str, start, end)
609
610 - def endswith(self, suffix, start=0, end=sys.maxsize):
611 """Return True if the Seq ends with the given suffix, False otherwise. 612 613 This behaves like the python string method of the same name. 614 615 Return True if the sequence ends with the specified suffix 616 (a string or another Seq object), False otherwise. 617 With optional start, test sequence beginning at that position. 618 With optional end, stop comparing sequence at that position. 619 suffix can also be a tuple of strings to try. e.g. 620 621 >>> from Bio.Seq import Seq 622 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 623 >>> my_rna.endswith("UUG") 624 True 625 >>> my_rna.endswith("AUG") 626 False 627 >>> my_rna.endswith("AUG", 0, 18) 628 True 629 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 630 True 631 """ 632 # If it has one, check the alphabet: 633 if isinstance(suffix, tuple): 634 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 635 for p in suffix) 636 return str(self).endswith(suffix_strs, start, end) 637 else: 638 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 639 return str(self).endswith(suffix_str, start, end)
640
641 - def split(self, sep=None, maxsplit=-1):
642 """Split method, like that of a python string. 643 644 This behaves like the python string method of the same name. 645 646 Return a list of the 'words' in the string (as Seq objects), 647 using sep as the delimiter string. If maxsplit is given, at 648 most maxsplit splits are done. If maxsplit is omitted, all 649 splits are made. 650 651 Following the python string method, sep will by default be any 652 white space (tabs, spaces, newlines) but this is unlikely to 653 apply to biological sequences. 654 655 e.g. 656 657 >>> from Bio.Seq import Seq 658 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 659 >>> my_aa = my_rna.translate() 660 >>> my_aa 661 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 662 >>> for pep in my_aa.split("*"): 663 ... pep 664 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 665 Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 666 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 667 >>> for pep in my_aa.split("*", 1): 668 ... pep 669 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 670 Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 671 672 See also the rsplit method: 673 674 >>> for pep in my_aa.rsplit("*", 1): 675 ... pep 676 Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 677 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 678 """ 679 # If it has one, check the alphabet: 680 sep_str = self._get_seq_str_and_check_alphabet(sep) 681 # TODO - If the sep is the defined stop symbol, or gap char, 682 # should we adjust the alphabet? 683 return [Seq(part, self.alphabet) 684 for part in str(self).split(sep_str, maxsplit)]
685
686 - def rsplit(self, sep=None, maxsplit=-1):
687 """Do a right split method, like that of a python string. 688 689 This behaves like the python string method of the same name. 690 691 Return a list of the 'words' in the string (as Seq objects), 692 using sep as the delimiter string. If maxsplit is given, at 693 most maxsplit splits are done COUNTING FROM THE RIGHT. 694 If maxsplit is omitted, all splits are made. 695 696 Following the python string method, sep will by default be any 697 white space (tabs, spaces, newlines) but this is unlikely to 698 apply to biological sequences. 699 700 e.g. print(my_seq.rsplit("*",1)) 701 702 See also the split method. 703 """ 704 # If it has one, check the alphabet: 705 sep_str = self._get_seq_str_and_check_alphabet(sep) 706 return [Seq(part, self.alphabet) 707 for part in str(self).rsplit(sep_str, maxsplit)]
708
709 - def strip(self, chars=None):
710 """Return a new Seq object with leading and trailing ends stripped. 711 712 This behaves like the python string method of the same name. 713 714 Optional argument chars defines which characters to remove. If 715 omitted or None (default) then as for the python string method, 716 this defaults to removing any white space. 717 718 e.g. print(my_seq.strip("-")) 719 720 See also the lstrip and rstrip methods. 721 """ 722 # If it has one, check the alphabet: 723 strip_str = self._get_seq_str_and_check_alphabet(chars) 724 return Seq(str(self).strip(strip_str), self.alphabet)
725
726 - def lstrip(self, chars=None):
727 """Return a new Seq object with leading (left) end stripped. 728 729 This behaves like the python string method of the same name. 730 731 Optional argument chars defines which characters to remove. If 732 omitted or None (default) then as for the python string method, 733 this defaults to removing any white space. 734 735 e.g. print(my_seq.lstrip("-")) 736 737 See also the strip and rstrip methods. 738 """ 739 # If it has one, check the alphabet: 740 strip_str = self._get_seq_str_and_check_alphabet(chars) 741 return Seq(str(self).lstrip(strip_str), self.alphabet)
742
743 - def rstrip(self, chars=None):
744 """Return a new Seq object with trailing (right) end stripped. 745 746 This behaves like the python string method of the same name. 747 748 Optional argument chars defines which characters to remove. If 749 omitted or None (default) then as for the python string method, 750 this defaults to removing any white space. 751 752 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 753 754 >>> from Bio.Alphabet import IUPAC 755 >>> from Bio.Seq import Seq 756 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 757 >>> my_seq 758 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 759 >>> my_seq.rstrip("A") 760 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 761 762 See also the strip and lstrip methods. 763 """ 764 # If it has one, check the alphabet: 765 strip_str = self._get_seq_str_and_check_alphabet(chars) 766 return Seq(str(self).rstrip(strip_str), self.alphabet)
767
768 - def upper(self):
769 """Return an upper case copy of the sequence. 770 771 >>> from Bio.Alphabet import HasStopCodon, generic_protein 772 >>> from Bio.Seq import Seq 773 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 774 >>> my_seq 775 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 776 >>> my_seq.lower() 777 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 778 >>> my_seq.upper() 779 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 780 781 This will adjust the alphabet if required. See also the lower method. 782 """ 783 return Seq(str(self).upper(), self.alphabet._upper())
784
785 - def lower(self):
786 """Return a lower case copy of the sequence. 787 788 This will adjust the alphabet if required. Note that the IUPAC 789 alphabets are upper case only, and thus a generic alphabet must be 790 substituted. 791 792 >>> from Bio.Alphabet import Gapped, generic_dna 793 >>> from Bio.Alphabet import IUPAC 794 >>> from Bio.Seq import Seq 795 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", 796 ... Gapped(IUPAC.unambiguous_dna, "*")) 797 >>> my_seq 798 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 799 >>> my_seq.lower() 800 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 801 802 See also the upper method. 803 """ 804 return Seq(str(self).lower(), self.alphabet._lower())
805
806 - def complement(self):
807 """Return the complement sequence by creating a new Seq object. 808 809 >>> from Bio.Seq import Seq 810 >>> from Bio.Alphabet import IUPAC 811 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 812 >>> my_dna 813 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 814 >>> my_dna.complement() 815 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 816 817 You can of course used mixed case sequences, 818 819 >>> from Bio.Seq import Seq 820 >>> from Bio.Alphabet import generic_dna 821 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 822 >>> my_dna 823 Seq('CCCCCgatA-GD', DNAAlphabet()) 824 >>> my_dna.complement() 825 Seq('GGGGGctaT-CH', DNAAlphabet()) 826 827 Note in the above example, ambiguous character D denotes 828 G, A or T so its complement is H (for C, T or A). 829 830 Trying to complement a protein sequence raises an exception. 831 832 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 833 >>> my_protein.complement() 834 Traceback (most recent call last): 835 ... 836 ValueError: Proteins do not have complements! 837 """ 838 base = Alphabet._get_base_alphabet(self.alphabet) 839 if isinstance(base, Alphabet.ProteinAlphabet): 840 raise ValueError("Proteins do not have complements!") 841 if isinstance(base, Alphabet.DNAAlphabet): 842 ttable = _dna_complement_table 843 elif isinstance(base, Alphabet.RNAAlphabet): 844 ttable = _rna_complement_table 845 elif ('U' in self._data or 'u' in self._data) \ 846 and ('T' in self._data or 't' in self._data): 847 # TODO - Handle this cleanly? 848 raise ValueError("Mixed RNA/DNA found") 849 elif 'U' in self._data or 'u' in self._data: 850 ttable = _rna_complement_table 851 else: 852 ttable = _dna_complement_table 853 # Much faster on really long sequences than the previous loop based 854 # one. Thanks to Michael Palmer, University of Waterloo. 855 return Seq(str(self).translate(ttable), self.alphabet)
856
857 - def reverse_complement(self):
858 """Return the reverse complement sequence by creating a new Seq object. 859 860 >>> from Bio.Seq import Seq 861 >>> from Bio.Alphabet import IUPAC 862 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 863 >>> my_dna 864 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 865 >>> my_dna.reverse_complement() 866 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 867 868 Note in the above example, since R = G or A, its complement 869 is Y (which denotes C or T). 870 871 You can of course used mixed case sequences, 872 873 >>> from Bio.Seq import Seq 874 >>> from Bio.Alphabet import generic_dna 875 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 876 >>> my_dna 877 Seq('CCCCCgatA-G', DNAAlphabet()) 878 >>> my_dna.reverse_complement() 879 Seq('C-TatcGGGGG', DNAAlphabet()) 880 881 Trying to complement a protein sequence raises an exception: 882 883 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 884 >>> my_protein.reverse_complement() 885 Traceback (most recent call last): 886 ... 887 ValueError: Proteins do not have complements! 888 """ 889 # Use -1 stride/step to reverse the complement 890 return self.complement()[::-1]
891
892 - def transcribe(self):
893 """Return the RNA sequence from a DNA sequence by creating a new Seq object. 894 895 >>> from Bio.Seq import Seq 896 >>> from Bio.Alphabet import IUPAC 897 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 898 ... IUPAC.unambiguous_dna) 899 >>> coding_dna 900 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 901 >>> coding_dna.transcribe() 902 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 903 904 Trying to transcribe a protein or RNA sequence raises an exception: 905 906 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 907 >>> my_protein.transcribe() 908 Traceback (most recent call last): 909 ... 910 ValueError: Proteins cannot be transcribed! 911 """ 912 base = Alphabet._get_base_alphabet(self.alphabet) 913 if isinstance(base, Alphabet.ProteinAlphabet): 914 raise ValueError("Proteins cannot be transcribed!") 915 if isinstance(base, Alphabet.RNAAlphabet): 916 raise ValueError("RNA cannot be transcribed!") 917 918 if self.alphabet == IUPAC.unambiguous_dna: 919 alphabet = IUPAC.unambiguous_rna 920 elif self.alphabet == IUPAC.ambiguous_dna: 921 alphabet = IUPAC.ambiguous_rna 922 else: 923 alphabet = Alphabet.generic_rna 924 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
925
926 - def back_transcribe(self):
927 """Return the DNA sequence from an RNA sequence by creating a new Seq object. 928 929 >>> from Bio.Seq import Seq 930 >>> from Bio.Alphabet import IUPAC 931 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 932 ... IUPAC.unambiguous_rna) 933 >>> messenger_rna 934 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 935 >>> messenger_rna.back_transcribe() 936 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 937 938 Trying to back-transcribe a protein or DNA sequence raises an 939 exception: 940 941 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 942 >>> my_protein.back_transcribe() 943 Traceback (most recent call last): 944 ... 945 ValueError: Proteins cannot be back transcribed! 946 """ 947 base = Alphabet._get_base_alphabet(self.alphabet) 948 if isinstance(base, Alphabet.ProteinAlphabet): 949 raise ValueError("Proteins cannot be back transcribed!") 950 if isinstance(base, Alphabet.DNAAlphabet): 951 raise ValueError("DNA cannot be back transcribed!") 952 953 if self.alphabet == IUPAC.unambiguous_rna: 954 alphabet = IUPAC.unambiguous_dna 955 elif self.alphabet == IUPAC.ambiguous_rna: 956 alphabet = IUPAC.ambiguous_dna 957 else: 958 alphabet = Alphabet.generic_dna 959 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
960
961 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 962 cds=False, gap=None):
963 """Turn a nucleotide sequence into a protein sequence by creating a new Seq object. 964 965 This method will translate DNA or RNA sequences, and those with a 966 nucleotide or generic alphabet. Trying to translate a protein 967 sequence raises an exception. 968 969 Arguments: 970 - table - Which codon table to use? This can be either a name 971 (string), an NCBI identifier (integer), or a CodonTable 972 object (useful for non-standard genetic codes). This 973 defaults to the "Standard" table. 974 - stop_symbol - Single character string, what to use for 975 terminators. This defaults to the asterisk, "*". 976 - to_stop - Boolean, defaults to False meaning do a full 977 translation continuing on past any stop codons (translated as the 978 specified stop_symbol). If True, translation is terminated at 979 the first in frame stop codon (and the stop_symbol is not 980 appended to the returned protein sequence). 981 - cds - Boolean, indicates this is a complete CDS. If True, 982 this checks the sequence starts with a valid alternative start 983 codon (which will be translated as methionine, M), that the 984 sequence length is a multiple of three, and that there is a 985 single in frame stop codon at the end (this will be excluded 986 from the protein sequence, regardless of the to_stop option). 987 If these tests fail, an exception is raised. 988 - gap - Single character string to denote symbol used for gaps. 989 It will try to guess the gap character from the alphabet. 990 991 e.g. Using the standard table: 992 993 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 994 >>> coding_dna.translate() 995 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 996 >>> coding_dna.translate(stop_symbol="@") 997 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 998 >>> coding_dna.translate(to_stop=True) 999 Seq('VAIVMGR', ExtendedIUPACProtein()) 1000 1001 Now using NCBI table 2, where TGA is not a stop codon: 1002 1003 >>> coding_dna.translate(table=2) 1004 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 1005 >>> coding_dna.translate(table=2, to_stop=True) 1006 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 1007 1008 In fact, GTG is an alternative start codon under NCBI table 2, meaning 1009 this sequence could be a complete CDS: 1010 1011 >>> coding_dna.translate(table=2, cds=True) 1012 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 1013 1014 It isn't a valid CDS under NCBI table 1, due to both the start codon 1015 and also the in frame stop codons: 1016 1017 >>> coding_dna.translate(table=1, cds=True) 1018 Traceback (most recent call last): 1019 ... 1020 TranslationError: First codon 'GTG' is not a start codon 1021 1022 If the sequence has no in-frame stop codon, then the to_stop argument 1023 has no effect: 1024 1025 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 1026 >>> coding_dna2.translate() 1027 Seq('LAIVMGR', ExtendedIUPACProtein()) 1028 >>> coding_dna2.translate(to_stop=True) 1029 Seq('LAIVMGR', ExtendedIUPACProtein()) 1030 1031 When translating gapped sequences, the gap character is inferred from 1032 the alphabet: 1033 1034 >>> from Bio.Alphabet import Gapped 1035 >>> coding_dna3 = Seq("GTG---GCCATT", Gapped(IUPAC.unambiguous_dna)) 1036 >>> coding_dna3.translate() 1037 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 1038 1039 It is possible to pass the gap character when the alphabet is missing: 1040 1041 >>> coding_dna4 = Seq("GTG---GCCATT") 1042 >>> coding_dna4.translate(gap='-') 1043 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 1044 1045 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1046 or a stop codon. These are translated as "X". Any invalid codon 1047 (e.g. "TA?" or "T-A") will throw a TranslationError. 1048 1049 NOTE - This does NOT behave like the python string's translate 1050 method. For that use str(my_seq).translate(...) instead. 1051 """ 1052 if isinstance(table, str) and len(table) == 256: 1053 raise ValueError("The Seq object translate method DOES NOT take " 1054 "a 256 character string mapping table like " 1055 "the python string object's translate method. " 1056 "Use str(my_seq).translate(...) instead.") 1057 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1058 Alphabet.ProteinAlphabet): 1059 raise ValueError("Proteins cannot be translated!") 1060 try: 1061 table_id = int(table) 1062 except ValueError: 1063 # Assume its a table name 1064 if self.alphabet == IUPAC.unambiguous_dna: 1065 # Will use standard IUPAC protein alphabet, no need for X 1066 codon_table = CodonTable.unambiguous_dna_by_name[table] 1067 elif self.alphabet == IUPAC.unambiguous_rna: 1068 # Will use standard IUPAC protein alphabet, no need for X 1069 codon_table = CodonTable.unambiguous_rna_by_name[table] 1070 else: 1071 # This will use the extended IUPAC protein alphabet with X etc. 1072 # The same table can be used for RNA or DNA (we use this for 1073 # translating strings). 1074 codon_table = CodonTable.ambiguous_generic_by_name[table] 1075 except (AttributeError, TypeError): 1076 # Assume its a CodonTable object 1077 if isinstance(table, CodonTable.CodonTable): 1078 codon_table = table 1079 else: 1080 raise ValueError('Bad table argument') 1081 else: 1082 # Assume its a table ID 1083 if self.alphabet == IUPAC.unambiguous_dna: 1084 # Will use standard IUPAC protein alphabet, no need for X 1085 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 1086 elif self.alphabet == IUPAC.unambiguous_rna: 1087 # Will use standard IUPAC protein alphabet, no need for X 1088 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 1089 else: 1090 # This will use the extended IUPAC protein alphabet with X etc. 1091 # The same table can be used for RNA or DNA (we use this for 1092 # translating strings). 1093 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1094 1095 # Deal with gaps for translation 1096 if hasattr(self.alphabet, "gap_char"): 1097 if not gap: 1098 gap = self.alphabet.gap_char 1099 elif gap != self.alphabet.gap_char: 1100 raise ValueError( 1101 "Gap {0!r} does not match {1!r} from alphabet".format( 1102 gap, self.alphabet.gap_char)) 1103 1104 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop, 1105 cds, gap=gap) 1106 1107 if gap and gap in protein: 1108 alphabet = Alphabet.Gapped(codon_table.protein_alphabet, gap) 1109 else: 1110 alphabet = codon_table.protein_alphabet 1111 1112 if stop_symbol in protein: 1113 alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol) 1114 1115 return Seq(protein, alphabet)
1116
1117 - def ungap(self, gap=None):
1118 """Return a copy of the sequence without the gap character(s). 1119 1120 The gap character can be specified in two ways - either as an explicit 1121 argument, or via the sequence's alphabet. For example: 1122 1123 >>> from Bio.Seq import Seq 1124 >>> from Bio.Alphabet import generic_dna 1125 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1126 >>> my_dna 1127 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1128 >>> my_dna.ungap("-") 1129 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1130 1131 If the gap character is not given as an argument, it will be taken from 1132 the sequence's alphabet (if defined). Notice that the returned 1133 sequence's alphabet is adjusted since it no longer requires a gapped 1134 alphabet: 1135 1136 >>> from Bio.Seq import Seq 1137 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1138 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1139 >>> my_pro 1140 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1141 >>> my_pro.ungap() 1142 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1143 1144 Or, with a simpler gapped DNA example: 1145 1146 >>> from Bio.Seq import Seq 1147 >>> from Bio.Alphabet import IUPAC, Gapped 1148 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1149 >>> my_seq 1150 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1151 >>> my_seq.ungap() 1152 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1153 1154 As long as it is consistent with the alphabet, although it is 1155 redundant, you can still supply the gap character as an argument to 1156 this method: 1157 1158 >>> my_seq 1159 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1160 >>> my_seq.ungap("=") 1161 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1162 1163 However, if the gap character given as the argument disagrees with that 1164 declared in the alphabet, an exception is raised: 1165 1166 >>> my_seq 1167 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1168 >>> my_seq.ungap("-") 1169 Traceback (most recent call last): 1170 ... 1171 ValueError: Gap '-' does not match '=' from alphabet 1172 1173 Finally, if a gap character is not supplied, and the alphabet does not 1174 define one, an exception is raised: 1175 1176 >>> from Bio.Seq import Seq 1177 >>> from Bio.Alphabet import generic_dna 1178 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1179 >>> my_dna 1180 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1181 >>> my_dna.ungap() 1182 Traceback (most recent call last): 1183 ... 1184 ValueError: Gap character not given and not defined in alphabet 1185 1186 """ 1187 if hasattr(self.alphabet, "gap_char"): 1188 if not gap: 1189 gap = self.alphabet.gap_char 1190 elif gap != self.alphabet.gap_char: 1191 raise ValueError( 1192 "Gap {0!r} does not match {1!r} from alphabet".format( 1193 gap, self.alphabet.gap_char)) 1194 alpha = Alphabet._ungap(self.alphabet) 1195 elif not gap: 1196 raise ValueError("Gap character not given and not defined in " 1197 "alphabet") 1198 else: 1199 alpha = self.alphabet # modify! 1200 if len(gap) != 1 or not isinstance(gap, str): 1201 raise ValueError("Unexpected gap character, {0!r}".format(gap)) 1202 return Seq(str(self).replace(gap, ""), alpha)
1203 1204
1205 -class UnknownSeq(Seq):
1206 """Read-only sequence object of known length but unknown contents. 1207 1208 If you have an unknown sequence, you can represent this with a normal 1209 Seq object, for example: 1210 1211 >>> my_seq = Seq("N"*5) 1212 >>> my_seq 1213 Seq('NNNNN', Alphabet()) 1214 >>> len(my_seq) 1215 5 1216 >>> print(my_seq) 1217 NNNNN 1218 1219 However, this is rather wasteful of memory (especially for large 1220 sequences), which is where this class is most useful: 1221 1222 >>> unk_five = UnknownSeq(5) 1223 >>> unk_five 1224 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1225 >>> len(unk_five) 1226 5 1227 >>> print(unk_five) 1228 ????? 1229 1230 You can add unknown sequence together, provided their alphabets and 1231 characters are compatible, and get another memory saving UnknownSeq: 1232 1233 >>> unk_four = UnknownSeq(4) 1234 >>> unk_four 1235 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1236 >>> unk_four + unk_five 1237 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1238 1239 If the alphabet or characters don't match up, the addition gives an 1240 ordinary Seq object: 1241 1242 >>> unk_nnnn = UnknownSeq(4, character = "N") 1243 >>> unk_nnnn 1244 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1245 >>> unk_nnnn + unk_four 1246 Seq('NNNN????', Alphabet()) 1247 1248 Combining with a real Seq gives a new Seq object: 1249 1250 >>> known_seq = Seq("ACGT") 1251 >>> unk_four + known_seq 1252 Seq('????ACGT', Alphabet()) 1253 >>> known_seq + unk_four 1254 Seq('ACGT????', Alphabet()) 1255 """ 1256
1257 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, 1258 character=None):
1259 """Create a new UnknownSeq object. 1260 1261 If character is omitted, it is determined from the alphabet, "N" for 1262 nucleotides, "X" for proteins, and "?" otherwise. 1263 """ 1264 self._length = int(length) 1265 if self._length < 0: 1266 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1267 raise ValueError("Length must not be negative.") 1268 self.alphabet = alphabet 1269 if character: 1270 if len(character) != 1: 1271 raise ValueError("character argument should be a single " 1272 "letter string.") 1273 self._character = character 1274 else: 1275 base = Alphabet._get_base_alphabet(alphabet) 1276 # TODO? Check the case of the letters in the alphabet? 1277 # We may have to use "n" instead of "N" etc. 1278 if isinstance(base, Alphabet.NucleotideAlphabet): 1279 self._character = "N" 1280 elif isinstance(base, Alphabet.ProteinAlphabet): 1281 self._character = "X" 1282 else: 1283 self._character = "?"
1284
1285 - def __len__(self):
1286 """Return the stated length of the unknown sequence.""" 1287 return self._length
1288
1289 - def __str__(self):
1290 """Return the unknown sequence as full string of the given length.""" 1291 return self._character * self._length
1292
1293 - def __repr__(self):
1294 """Return (truncated) representation of the sequence for debugging.""" 1295 return "UnknownSeq({0}, alphabet = {1!r}, character = {2!r})".format( 1296 self._length, self.alphabet, self._character)
1297
1298 - def __add__(self, other):
1299 """Add another sequence or string to this sequence. 1300 1301 Adding two UnknownSeq objects returns another UnknownSeq object 1302 provided the character is the same and the alphabets are compatible. 1303 1304 >>> from Bio.Seq import UnknownSeq 1305 >>> from Bio.Alphabet import generic_protein 1306 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1307 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1308 1309 If the characters differ, an UnknownSeq object cannot be used, so a 1310 Seq object is returned: 1311 1312 >>> from Bio.Seq import UnknownSeq 1313 >>> from Bio.Alphabet import generic_protein 1314 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1315 ... character="x") 1316 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1317 1318 If adding a string to an UnknownSeq, a new Seq is returned with the 1319 same alphabet: 1320 1321 >>> from Bio.Seq import UnknownSeq 1322 >>> from Bio.Alphabet import generic_protein 1323 >>> UnknownSeq(5, generic_protein) + "LV" 1324 Seq('XXXXXLV', ProteinAlphabet()) 1325 """ 1326 if isinstance(other, UnknownSeq) and \ 1327 other._character == self._character: 1328 # TODO - Check the alphabets match 1329 return UnknownSeq(len(self) + len(other), 1330 self.alphabet, self._character) 1331 # Offload to the base class... 1332 return Seq(str(self), self.alphabet) + other
1333
1334 - def __radd__(self, other):
1335 """Add a sequence on the left.""" 1336 # If other is an UnknownSeq, then __add__ would be called. 1337 # Offload to the base class... 1338 return other + Seq(str(self), self.alphabet)
1339
1340 - def __getitem__(self, index):
1341 """Get a subsequence from the UnknownSeq object. 1342 1343 >>> unk = UnknownSeq(8, character="N") 1344 >>> print(unk[:]) 1345 NNNNNNNN 1346 >>> print(unk[5:3]) 1347 <BLANKLINE> 1348 >>> print(unk[1:-1]) 1349 NNNNNN 1350 >>> print(unk[1:-1:2]) 1351 NNN 1352 """ 1353 if isinstance(index, int): 1354 # TODO - Check the bounds without wasting memory 1355 return str(self)[index] 1356 old_length = self._length 1357 step = index.step 1358 if step is None or step == 1: 1359 # This calculates the length you'd get from ("N"*old_length)[index] 1360 start = index.start 1361 end = index.stop 1362 if start is None: 1363 start = 0 1364 elif start < 0: 1365 start = max(0, old_length + start) 1366 elif start > old_length: 1367 start = old_length 1368 if end is None: 1369 end = old_length 1370 elif end < 0: 1371 end = max(0, old_length + end) 1372 elif end > old_length: 1373 end = old_length 1374 new_length = max(0, end - start) 1375 elif step == 0: 1376 raise ValueError("slice step cannot be zero") 1377 else: 1378 # TODO - handle step efficiently 1379 new_length = len(("X" * old_length)[index]) 1380 # assert new_length == len(("X"*old_length)[index]), \ 1381 # (index, start, end, step, old_length, 1382 # new_length, len(("X"*old_length)[index])) 1383 return UnknownSeq(new_length, self.alphabet, self._character)
1384
1385 - def count(self, sub, start=0, end=sys.maxsize):
1386 """Return a non-overlapping count, like that of a python string. 1387 1388 This behaves like the python string (and Seq object) method of the 1389 same name, which does a non-overlapping count! 1390 1391 For an overlapping search use the newer count_overlap() method. 1392 1393 Returns an integer, the number of occurrences of substring 1394 argument sub in the (sub)sequence given by [start:end]. 1395 Optional arguments start and end are interpreted as in slice 1396 notation. 1397 1398 Arguments: 1399 - sub - a string or another Seq object to look for 1400 - start - optional integer, slice start 1401 - end - optional integer, slice end 1402 1403 >>> "NNNN".count("N") 1404 4 1405 >>> Seq("NNNN").count("N") 1406 4 1407 >>> UnknownSeq(4, character="N").count("N") 1408 4 1409 >>> UnknownSeq(4, character="N").count("A") 1410 0 1411 >>> UnknownSeq(4, character="N").count("AA") 1412 0 1413 1414 HOWEVER, please note because that python strings and Seq objects (and 1415 MutableSeq objects) do a non-overlapping search, this may not give 1416 the answer you expect: 1417 1418 >>> UnknownSeq(4, character="N").count("NN") 1419 2 1420 >>> UnknownSeq(4, character="N").count("NNN") 1421 1 1422 """ 1423 sub_str = self._get_seq_str_and_check_alphabet(sub) 1424 if len(sub_str) == 1: 1425 if str(sub_str) == self._character: 1426 if start == 0 and end >= self._length: 1427 return self._length 1428 else: 1429 # This could be done more cleverly... 1430 return str(self).count(sub_str, start, end) 1431 else: 1432 return 0 1433 else: 1434 if set(sub_str) == set(self._character): 1435 if start == 0 and end >= self._length: 1436 return self._length // len(sub_str) 1437 else: 1438 # This could be done more cleverly... 1439 return str(self).count(sub_str, start, end) 1440 else: 1441 return 0
1442
1443 - def count_overlap(self, sub, start=0, end=sys.maxsize):
1444 """Return an overlapping count. 1445 1446 For a non-overlapping search use the count() method. 1447 1448 Returns an integer, the number of occurrences of substring 1449 argument sub in the (sub)sequence given by [start:end]. 1450 Optional arguments start and end are interpreted as in slice 1451 notation. 1452 1453 Arguments: 1454 - sub - a string or another Seq object to look for 1455 - start - optional integer, slice start 1456 - end - optional integer, slice end 1457 1458 e.g. 1459 1460 >>> from Bio.Seq import UnknownSeq 1461 >>> UnknownSeq(4, character="N").count_overlap("NN") 1462 3 1463 >>> UnknownSeq(4, character="N").count_overlap("NNN") 1464 2 1465 1466 Where substrings do not overlap, should behave the same as 1467 the count() method: 1468 1469 >>> UnknownSeq(4, character="N").count_overlap("N") 1470 4 1471 >>> UnknownSeq(4, character="N").count_overlap("N") == UnknownSeq(4, character="N").count("N") 1472 True 1473 >>> UnknownSeq(4, character="N").count_overlap("A") 1474 0 1475 >>> UnknownSeq(4, character="N").count_overlap("A") == UnknownSeq(4, character="N").count("A") 1476 True 1477 >>> UnknownSeq(4, character="N").count_overlap("AA") 1478 0 1479 >>> UnknownSeq(4, character="N").count_overlap("AA") == UnknownSeq(4, character="N").count("AA") 1480 True 1481 """ 1482 sub_str = self._get_seq_str_and_check_alphabet(sub) 1483 len_self, len_sub_str = self._length, len(sub_str) 1484 # Handling case where substring not in self 1485 if set(sub_str) != set(self._character): 1486 return 0 1487 # Setting None to the default arguments 1488 if start is None: 1489 start = 0 1490 if end is None: 1491 end = sys.maxsize 1492 # Truncating start and end to max of self._length and min of -self._length 1493 start = max(min(start, len_self), -len_self) 1494 end = max(min(end, len_self), -len_self) 1495 # Convert start and ends to positive indexes 1496 if start < 0: 1497 start += len_self 1498 if end < 0: 1499 end += len_self 1500 # Handle case where end <= start (no negative step argument here) 1501 # and case where len_sub_str is larger than the search space 1502 if end <= start or (end - start) < len_sub_str: 1503 return 0 1504 # 'Normal' calculation 1505 return end - start - len_sub_str + 1
1506
1507 - def complement(self):
1508 """Return the complement of an unknown nucleotide equals itself. 1509 1510 >>> my_nuc = UnknownSeq(8) 1511 >>> my_nuc 1512 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1513 >>> print(my_nuc) 1514 ???????? 1515 >>> my_nuc.complement() 1516 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1517 >>> print(my_nuc.complement()) 1518 ???????? 1519 """ 1520 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1521 Alphabet.ProteinAlphabet): 1522 raise ValueError("Proteins do not have complements!") 1523 return self
1524
1525 - def reverse_complement(self):
1526 """Return the reverse complement of an unknown sequence. 1527 1528 The reverse complement of an unknown nucleotide equals itself: 1529 1530 >>> from Bio.Seq import UnknownSeq 1531 >>> from Bio.Alphabet import generic_dna 1532 >>> example = UnknownSeq(6, generic_dna) 1533 >>> print(example) 1534 NNNNNN 1535 >>> print(example.reverse_complement()) 1536 NNNNNN 1537 """ 1538 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1539 Alphabet.ProteinAlphabet): 1540 raise ValueError("Proteins do not have complements!") 1541 return self
1542
1543 - def transcribe(self):
1544 """Return an unknown RNA sequence from an unknown DNA sequence. 1545 1546 >>> my_dna = UnknownSeq(10, character="N") 1547 >>> my_dna 1548 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1549 >>> print(my_dna) 1550 NNNNNNNNNN 1551 >>> my_rna = my_dna.transcribe() 1552 >>> my_rna 1553 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1554 >>> print(my_rna) 1555 NNNNNNNNNN 1556 """ 1557 # Offload the alphabet stuff 1558 s = Seq(self._character, self.alphabet).transcribe() 1559 return UnknownSeq(self._length, s.alphabet, self._character)
1560
1561 - def back_transcribe(self):
1562 """Return an unknown DNA sequence from an unknown RNA sequence. 1563 1564 >>> my_rna = UnknownSeq(20, character="N") 1565 >>> my_rna 1566 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1567 >>> print(my_rna) 1568 NNNNNNNNNNNNNNNNNNNN 1569 >>> my_dna = my_rna.back_transcribe() 1570 >>> my_dna 1571 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1572 >>> print(my_dna) 1573 NNNNNNNNNNNNNNNNNNNN 1574 """ 1575 # Offload the alphabet stuff 1576 s = Seq(self._character, self.alphabet).back_transcribe() 1577 return UnknownSeq(self._length, s.alphabet, self._character)
1578
1579 - def upper(self):
1580 """Return an upper case copy of the sequence. 1581 1582 >>> from Bio.Alphabet import generic_dna 1583 >>> from Bio.Seq import UnknownSeq 1584 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1585 >>> my_seq 1586 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1587 >>> print(my_seq) 1588 nnnnnnnnnnnnnnnnnnnn 1589 >>> my_seq.upper() 1590 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1591 >>> print(my_seq.upper()) 1592 NNNNNNNNNNNNNNNNNNNN 1593 1594 This will adjust the alphabet if required. See also the lower method. 1595 """ 1596 return UnknownSeq(self._length, self.alphabet._upper(), 1597 self._character.upper())
1598
1599 - def lower(self):
1600 """Return a lower case copy of the sequence. 1601 1602 This will adjust the alphabet if required: 1603 1604 >>> from Bio.Alphabet import IUPAC 1605 >>> from Bio.Seq import UnknownSeq 1606 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1607 >>> my_seq 1608 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1609 >>> print(my_seq) 1610 XXXXXXXXXXXXXXXXXXXX 1611 >>> my_seq.lower() 1612 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1613 >>> print(my_seq.lower()) 1614 xxxxxxxxxxxxxxxxxxxx 1615 1616 See also the upper method. 1617 """ 1618 return UnknownSeq(self._length, self.alphabet._lower(), 1619 self._character.lower())
1620
1621 - def translate(self, **kwargs):
1622 """Translate an unknown nucleotide sequence into an unknown protein. 1623 1624 e.g. 1625 1626 >>> my_seq = UnknownSeq(9, character="N") 1627 >>> print(my_seq) 1628 NNNNNNNNN 1629 >>> my_protein = my_seq.translate() 1630 >>> my_protein 1631 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1632 >>> print(my_protein) 1633 XXX 1634 1635 In comparison, using a normal Seq object: 1636 1637 >>> my_seq = Seq("NNNNNNNNN") 1638 >>> print(my_seq) 1639 NNNNNNNNN 1640 >>> my_protein = my_seq.translate() 1641 >>> my_protein 1642 Seq('XXX', ExtendedIUPACProtein()) 1643 >>> print(my_protein) 1644 XXX 1645 1646 """ 1647 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1648 Alphabet.ProteinAlphabet): 1649 raise ValueError("Proteins cannot be translated!") 1650 return UnknownSeq(self._length // 3, Alphabet.generic_protein, "X")
1651
1652 - def ungap(self, gap=None):
1653 """Return a copy of the sequence without the gap character(s). 1654 1655 The gap character can be specified in two ways - either as an explicit 1656 argument, or via the sequence's alphabet. For example: 1657 1658 >>> from Bio.Seq import UnknownSeq 1659 >>> from Bio.Alphabet import Gapped, generic_dna 1660 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1661 >>> my_dna 1662 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1663 >>> my_dna.ungap() 1664 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1665 >>> my_dna.ungap("-") 1666 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1667 1668 If the UnknownSeq is using the gap character, then an empty Seq is 1669 returned: 1670 1671 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1672 >>> my_gap 1673 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1674 >>> my_gap.ungap() 1675 Seq('', DNAAlphabet()) 1676 >>> my_gap.ungap("-") 1677 Seq('', DNAAlphabet()) 1678 1679 Notice that the returned sequence's alphabet is adjusted to remove any 1680 explicit gap character declaration. 1681 """ 1682 # Offload the alphabet stuff 1683 s = Seq(self._character, self.alphabet).ungap() 1684 if s: 1685 return UnknownSeq(self._length, s.alphabet, self._character) 1686 else: 1687 return Seq("", s.alphabet)
1688 1689
1690 -class MutableSeq(object):
1691 """An editable sequence object (with an alphabet). 1692 1693 Unlike normal python strings and our basic sequence object (the Seq class) 1694 which are immutable, the MutableSeq lets you edit the sequence in place. 1695 However, this means you cannot use a MutableSeq object as a dictionary key. 1696 1697 >>> from Bio.Seq import MutableSeq 1698 >>> from Bio.Alphabet import generic_dna 1699 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1700 >>> my_seq 1701 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1702 >>> my_seq[5] 1703 'T' 1704 >>> my_seq[5] = "A" 1705 >>> my_seq 1706 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1707 >>> my_seq[5] 1708 'A' 1709 >>> my_seq[5:8] = "NNN" 1710 >>> my_seq 1711 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1712 >>> len(my_seq) 1713 11 1714 1715 Note that the MutableSeq object does not support as many string-like 1716 or biological methods as the Seq object. 1717 """ 1718
1719 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1720 """Initialize the class.""" 1721 if sys.version_info[0] == 3: 1722 self.array_indicator = "u" 1723 else: 1724 self.array_indicator = "c" 1725 if isinstance(data, str): # TODO - What about unicode? 1726 self.data = array.array(self.array_indicator, data) 1727 else: 1728 self.data = data # assumes the input is an array 1729 self.alphabet = alphabet
1730
1731 - def __repr__(self):
1732 """Return (truncated) representation of the sequence for debugging.""" 1733 if len(self) > 60: 1734 # Shows the last three letters as it is often useful to see if 1735 # there is a stop codon at the end of a sequence. 1736 # Note total length is 54+3+3=60 1737 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 1738 str(self[:54]), 1739 str(self[-3:]), 1740 self.alphabet) 1741 else: 1742 return "{0}('{1}', {2!r})".format(self.__class__.__name__, 1743 str(self), 1744 self.alphabet)
1745
1746 - def __str__(self):
1747 """Return the full sequence as a python string. 1748 1749 Note that Biopython 1.44 and earlier would give a truncated 1750 version of repr(my_seq) for str(my_seq). If you are writing code 1751 which needs to be backwards compatible with old Biopython, you 1752 should continue to use my_seq.tostring() rather than str(my_seq). 1753 """ 1754 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1755 return "".join(self.data)
1756
1757 - def __eq__(self, other):
1758 """Compare the sequence to another sequence or a string (README). 1759 1760 Currently if compared to another sequence the alphabets must be 1761 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1762 an exception. Otherwise only the sequence itself is compared, not the 1763 precise alphabet. 1764 1765 A future release of Biopython will change this (and the Seq object etc) 1766 to use simple string comparison. The plan is that comparing sequences 1767 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1768 but not an exception. 1769 1770 During this transition period, please just do explicit comparisons: 1771 1772 >>> seq1 = MutableSeq("ACGT") 1773 >>> seq2 = MutableSeq("ACGT") 1774 >>> id(seq1) == id(seq2) 1775 False 1776 >>> str(seq1) == str(seq2) 1777 True 1778 1779 Biopython now does: 1780 1781 >>> seq1 == seq2 1782 True 1783 >>> seq1 == Seq("ACGT") 1784 True 1785 >>> seq1 == "ACGT" 1786 True 1787 1788 """ 1789 if hasattr(other, "alphabet"): 1790 if not Alphabet._check_type_compatible([self.alphabet, 1791 other.alphabet]): 1792 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1793 self.alphabet, other.alphabet), 1794 BiopythonWarning) 1795 if isinstance(other, MutableSeq): 1796 return self.data == other.data 1797 return str(self) == str(other)
1798
1799 - def __ne__(self, other):
1800 """Implement the not-equal operand.""" 1801 # Seem to require this method for Python 2 but not needed on Python 3? 1802 return not (self == other)
1803
1804 - def __lt__(self, other):
1805 """Implement the less-than operand.""" 1806 if hasattr(other, "alphabet"): 1807 if not Alphabet._check_type_compatible([self.alphabet, 1808 other.alphabet]): 1809 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1810 self.alphabet, other.alphabet), 1811 BiopythonWarning) 1812 if isinstance(other, MutableSeq): 1813 return self.data < other.data 1814 return str(self) < str(other)
1815
1816 - def __le__(self, other):
1817 """Implement the less-than or equal operand.""" 1818 if hasattr(other, "alphabet"): 1819 if not Alphabet._check_type_compatible([self.alphabet, 1820 other.alphabet]): 1821 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1822 self.alphabet, other.alphabet), 1823 BiopythonWarning) 1824 if isinstance(other, MutableSeq): 1825 return self.data <= other.data 1826 return str(self) <= str(other)
1827
1828 - def __len__(self):
1829 """Return the length of the sequence, use len(my_seq).""" 1830 return len(self.data)
1831
1832 - def __getitem__(self, index):
1833 """Return a subsequence of single letter, use my_seq[index].""" 1834 # Note since Python 2.0, __getslice__ is deprecated 1835 # and __getitem__ is used instead. 1836 # See http://docs.python.org/ref/sequence-methods.html 1837 if isinstance(index, int): 1838 # Return a single letter as a string 1839 return self.data[index] 1840 else: 1841 # Return the (sub)sequence as another Seq object 1842 return MutableSeq(self.data[index], self.alphabet)
1843
1844 - def __setitem__(self, index, value):
1845 """Set a subsequence of single letter via value parameter.""" 1846 # Note since Python 2.0, __setslice__ is deprecated 1847 # and __setitem__ is used instead. 1848 # See http://docs.python.org/ref/sequence-methods.html 1849 if isinstance(index, int): 1850 # Replacing a single letter with a new string 1851 self.data[index] = value 1852 else: 1853 # Replacing a sub-sequence 1854 if isinstance(value, MutableSeq): 1855 self.data[index] = value.data 1856 elif isinstance(value, type(self.data)): 1857 self.data[index] = value 1858 else: 1859 self.data[index] = array.array(self.array_indicator, 1860 str(value))
1861
1862 - def __delitem__(self, index):
1863 """Delete a subsequence of single letter.""" 1864 # Note since Python 2.0, __delslice__ is deprecated 1865 # and __delitem__ is used instead. 1866 # See http://docs.python.org/ref/sequence-methods.html 1867 1868 # Could be deleting a single letter, or a slice 1869 del self.data[index]
1870
1871 - def __add__(self, other):
1872 """Add another sequence or string to this sequence. 1873 1874 Returns a new MutableSeq object. 1875 """ 1876 if hasattr(other, "alphabet"): 1877 # other should be a Seq or a MutableSeq 1878 if not Alphabet._check_type_compatible([self.alphabet, 1879 other.alphabet]): 1880 raise TypeError( 1881 "Incompatible alphabets {0!r} and {1!r}".format( 1882 self.alphabet, other.alphabet)) 1883 # They should be the same sequence type (or one of them is generic) 1884 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1885 if isinstance(other, MutableSeq): 1886 # See test_GAQueens.py for an historic usage of a non-string 1887 # alphabet! Adding the arrays should support this. 1888 return self.__class__(self.data + other.data, a) 1889 else: 1890 return self.__class__(str(self) + str(other), a) 1891 elif isinstance(other, basestring): 1892 # other is a plain string - use the current alphabet 1893 return self.__class__(str(self) + str(other), self.alphabet) 1894 else: 1895 raise TypeError
1896
1897 - def __radd__(self, other):
1898 """Add a sequence on the left.""" 1899 if hasattr(other, "alphabet"): 1900 # other should be a Seq or a MutableSeq 1901 if not Alphabet._check_type_compatible([self.alphabet, 1902 other.alphabet]): 1903 raise TypeError( 1904 "Incompatible alphabets {0!r} and {1!r}".format( 1905 self.alphabet, other.alphabet)) 1906 # They should be the same sequence type (or one of them is generic) 1907 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1908 if isinstance(other, MutableSeq): 1909 # See test_GAQueens.py for an historic usage of a non-string 1910 # alphabet! Adding the arrays should support this. 1911 return self.__class__(other.data + self.data, a) 1912 else: 1913 return self.__class__(str(other) + str(self), a) 1914 elif isinstance(other, basestring): 1915 # other is a plain string - use the current alphabet 1916 return self.__class__(str(other) + str(self), self.alphabet) 1917 else: 1918 raise TypeError
1919
1920 - def append(self, c):
1921 """Add a subsequence to the mutable sequence object.""" 1922 self.data.append(c)
1923
1924 - def insert(self, i, c):
1925 """Add a subsequence to the mutable sequence object at a given index.""" 1926 self.data.insert(i, c)
1927
1928 - def pop(self, i=(-1)):
1929 """Remove a subsequence of a single letter at given index.""" 1930 c = self.data[i] 1931 del self.data[i] 1932 return c
1933
1934 - def remove(self, item):
1935 """Remove a subsequence of a single letter from mutable sequence.""" 1936 for i in range(len(self.data)): 1937 if self.data[i] == item: 1938 del self.data[i] 1939 return 1940 raise ValueError("MutableSeq.remove(x): x not in list")
1941
1942 - def count(self, sub, start=0, end=sys.maxsize):
1943 """Return a non-overlapping count, like that of a python string. 1944 1945 This behaves like the python string method of the same name, 1946 which does a non-overlapping count! 1947 1948 For an overlapping search use the newer count_overlap() method. 1949 1950 Returns an integer, the number of occurrences of substring 1951 argument sub in the (sub)sequence given by [start:end]. 1952 Optional arguments start and end are interpreted as in slice 1953 notation. 1954 1955 Arguments: 1956 - sub - a string or another Seq object to look for 1957 - start - optional integer, slice start 1958 - end - optional integer, slice end 1959 1960 e.g. 1961 1962 >>> from Bio.Seq import MutableSeq 1963 >>> my_mseq = MutableSeq("AAAATGA") 1964 >>> print(my_mseq.count("A")) 1965 5 1966 >>> print(my_mseq.count("ATG")) 1967 1 1968 >>> print(my_mseq.count(Seq("AT"))) 1969 1 1970 >>> print(my_mseq.count("AT", 2, -1)) 1971 1 1972 1973 HOWEVER, please note because that python strings, Seq objects and 1974 MutableSeq objects do a non-overlapping search, this may not give 1975 the answer you expect: 1976 1977 >>> "AAAA".count("AA") 1978 2 1979 >>> print(MutableSeq("AAAA").count("AA")) 1980 2 1981 1982 An overlapping search would give the answer as three! 1983 """ 1984 try: 1985 # TODO - Should we check the alphabet? 1986 search = str(sub) 1987 except AttributeError: 1988 search = sub 1989 1990 if not isinstance(search, basestring): 1991 raise TypeError("expected a string, Seq or MutableSeq") 1992 1993 if len(search) == 1: 1994 # Try and be efficient and work directly from the array. 1995 count = 0 1996 for c in self.data[start:end]: 1997 if c == search: 1998 count += 1 1999 return count 2000 else: 2001 # TODO - Can we do this more efficiently? 2002 return str(self).count(search, start, end)
2003
2004 - def count_overlap(self, sub, start=0, end=sys.maxsize):
2005 """Return an overlapping count. 2006 2007 For a non-overlapping search use the count() method. 2008 2009 Returns an integer, the number of occurrences of substring 2010 argument sub in the (sub)sequence given by [start:end]. 2011 Optional arguments start and end are interpreted as in slice 2012 notation. 2013 2014 Arguments: 2015 - sub - a string or another Seq object to look for 2016 - start - optional integer, slice start 2017 - end - optional integer, slice end 2018 2019 e.g. 2020 2021 >>> from Bio.Seq import MutableSeq 2022 >>> print(MutableSeq("AAAA").count_overlap("AA")) 2023 3 2024 >>> print(MutableSeq("ATATATATA").count_overlap("ATA")) 2025 4 2026 >>> print(MutableSeq("ATATATATA").count_overlap("ATA", 3, -1)) 2027 1 2028 2029 Where substrings do not overlap, should behave the same as 2030 the count() method: 2031 2032 >>> from Bio.Seq import MutableSeq 2033 >>> my_mseq = MutableSeq("AAAATGA") 2034 >>> print(my_mseq.count_overlap("A")) 2035 5 2036 >>> my_mseq.count_overlap("A") == my_mseq.count("A") 2037 True 2038 >>> print(my_mseq.count_overlap("ATG")) 2039 1 2040 >>> my_mseq.count_overlap("ATG") == my_mseq.count("ATG") 2041 True 2042 >>> print(my_mseq.count_overlap(Seq("AT"))) 2043 1 2044 >>> my_mseq.count_overlap(Seq("AT")) == my_mseq.count(Seq("AT")) 2045 True 2046 >>> print(my_mseq.count_overlap("AT", 2, -1)) 2047 1 2048 >>> my_mseq.count_overlap("AT", 2, -1) == my_mseq.count("AT", 2, -1) 2049 True 2050 2051 HOWEVER, do not use this method for such cases because the 2052 count() method is much for efficient. 2053 """ 2054 # The implementation is currently identical to that of 2055 # Seq.count_overlap() apart from the definition of sub_str 2056 sub_str = str(sub) 2057 self = str(self) 2058 overlap_count = 0 2059 while True: 2060 start = self.find(sub_str, start, end) + 1 2061 if start != 0: 2062 overlap_count += 1 2063 else: 2064 return overlap_count
2065
2066 - def index(self, item):
2067 """Return the position of a subsequence of a single letter.""" 2068 for i in range(len(self.data)): 2069 if self.data[i] == item: 2070 return i 2071 raise ValueError("MutableSeq.index(x): x not in list")
2072
2073 - def reverse(self):
2074 """Modify the mutable sequence to reverse itself. 2075 2076 No return value. 2077 """ 2078 self.data.reverse()
2079
2080 - def complement(self):
2081 """Modify the mutable sequence to take on its complement. 2082 2083 Trying to complement a protein sequence raises an exception. 2084 2085 No return value. 2086 """ 2087 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 2088 Alphabet.ProteinAlphabet): 2089 raise ValueError("Proteins do not have complements!") 2090 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 2091 d = ambiguous_dna_complement 2092 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 2093 d = ambiguous_rna_complement 2094 elif 'U' in self.data and 'T' in self.data: 2095 # TODO - Handle this cleanly? 2096 raise ValueError("Mixed RNA/DNA found") 2097 elif 'U' in self.data: 2098 d = ambiguous_rna_complement 2099 else: 2100 d = ambiguous_dna_complement 2101 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 2102 d.update(c) 2103 self.data = [d[c] for c in self.data] 2104 self.data = array.array(self.array_indicator, self.data)
2105
2106 - def reverse_complement(self):
2107 """Modify the mutable sequence to take on its reverse complement. 2108 2109 Trying to reverse complement a protein sequence raises an exception. 2110 2111 No return value. 2112 """ 2113 self.complement() 2114 self.data.reverse()
2115 2116 # Sorting a sequence makes no sense. 2117 # def sort(self, *args): self.data.sort(*args) 2118
2119 - def extend(self, other):
2120 """Add a sequence to the original mutable sequence object.""" 2121 if isinstance(other, MutableSeq): 2122 for c in other.data: 2123 self.data.append(c) 2124 else: 2125 for c in other: 2126 self.data.append(c)
2127
2128 - def tostring(self):
2129 """Return the full sequence as a python string (DEPRECATED). 2130 2131 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 2132 as this method is officially deprecated. 2133 2134 Because str(my_seq) will give you the full sequence as a python string, 2135 there is often no need to make an explicit conversion. For example, 2136 2137 >>> my_seq = Seq("ATCGTG") 2138 >>> my_name = "seq_1" 2139 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 2140 ID={seq_1}, sequence={ATCGTG} 2141 2142 On Biopython 1.44 or older you would have to have done this: 2143 2144 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 2145 ID={seq_1}, sequence={ATCGTG} 2146 """ 2147 from Bio import BiopythonDeprecationWarning 2148 warnings.warn("This method is obsolete; please use str(my_seq) " 2149 "instead of my_seq.tostring().", 2150 BiopythonDeprecationWarning) 2151 return "".join(self.data)
2152
2153 - def toseq(self):
2154 """Return the full sequence as a new immutable Seq object. 2155 2156 >>> from Bio.Seq import Seq 2157 >>> from Bio.Alphabet import IUPAC 2158 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 2159 ... IUPAC.protein) 2160 >>> my_mseq 2161 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 2162 >>> my_mseq.toseq() 2163 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 2164 2165 Note that the alphabet is preserved. 2166 """ 2167 return Seq("".join(self.data), self.alphabet)
2168 2169 2170 # The transcribe, backward_transcribe, and translate functions are 2171 # user-friendly versions of the corresponding functions in Bio.Transcribe 2172 # and Bio.Translate. The functions work both on Seq objects, and on strings. 2173
2174 -def transcribe(dna):
2175 """Transcribe a DNA sequence into RNA. 2176 2177 If given a string, returns a new string object. 2178 2179 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 2180 2181 Trying to transcribe a protein or RNA sequence raises an exception. 2182 2183 e.g. 2184 2185 >>> transcribe("ACTGN") 2186 'ACUGN' 2187 """ 2188 if isinstance(dna, Seq): 2189 return dna.transcribe() 2190 elif isinstance(dna, MutableSeq): 2191 return dna.toseq().transcribe() 2192 else: 2193 return dna.replace('T', 'U').replace('t', 'u')
2194 2195
2196 -def back_transcribe(rna):
2197 """Return the RNA sequence back-transcribed into DNA. 2198 2199 If given a string, returns a new string object. 2200 2201 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 2202 2203 Trying to transcribe a protein or DNA sequence raises an exception. 2204 2205 e.g. 2206 2207 >>> back_transcribe("ACUGN") 2208 'ACTGN' 2209 """ 2210 if isinstance(rna, Seq): 2211 return rna.back_transcribe() 2212 elif isinstance(rna, MutableSeq): 2213 return rna.toseq().back_transcribe() 2214 else: 2215 return rna.replace('U', 'T').replace('u', 't')
2216 2217
2218 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 2219 cds=False, pos_stop="X", gap=None):
2220 """Translate nucleotide string into a protein string (PRIVATE). 2221 2222 Arguments: 2223 - sequence - a string 2224 - table - a CodonTable object (NOT a table name or id number) 2225 - stop_symbol - a single character string, what to use for terminators. 2226 - to_stop - boolean, should translation terminate at the first 2227 in frame stop codon? If there is no in-frame stop codon 2228 then translation continues to the end. 2229 - pos_stop - a single character string for a possible stop codon 2230 (e.g. TAN or NNN) 2231 - cds - Boolean, indicates this is a complete CDS. If True, this 2232 checks the sequence starts with a valid alternative start 2233 codon (which will be translated as methionine, M), that the 2234 sequence length is a multiple of three, and that there is a 2235 single in frame stop codon at the end (this will be excluded 2236 from the protein sequence, regardless of the to_stop option). 2237 If these tests fail, an exception is raised. 2238 - gap - Single character string to denote symbol used for gaps. 2239 Defaults to None. 2240 2241 Returns a string. 2242 2243 e.g. 2244 2245 >>> from Bio.Data import CodonTable 2246 >>> table = CodonTable.ambiguous_dna_by_id[1] 2247 >>> _translate_str("AAA", table) 2248 'K' 2249 >>> _translate_str("TAR", table) 2250 '*' 2251 >>> _translate_str("TAN", table) 2252 'X' 2253 >>> _translate_str("TAN", table, pos_stop="@") 2254 '@' 2255 >>> _translate_str("TA?", table) 2256 Traceback (most recent call last): 2257 ... 2258 TranslationError: Codon 'TA?' is invalid 2259 2260 In a change to older versions of Biopython, partial codons are now 2261 always regarded as an error (previously only checked if cds=True) 2262 and will trigger a warning (likely to become an exception in a 2263 future release). 2264 2265 If **cds=True**, the start and stop codons are checked, and the start 2266 codon will be translated at methionine. The sequence must be an 2267 while number of codons. 2268 2269 >>> _translate_str("ATGCCCTAG", table, cds=True) 2270 'MP' 2271 >>> _translate_str("AAACCCTAG", table, cds=True) 2272 Traceback (most recent call last): 2273 ... 2274 TranslationError: First codon 'AAA' is not a start codon 2275 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2276 Traceback (most recent call last): 2277 ... 2278 TranslationError: Extra in frame stop codon found. 2279 """ 2280 sequence = sequence.upper() 2281 amino_acids = [] 2282 forward_table = table.forward_table 2283 stop_codons = table.stop_codons 2284 if table.nucleotide_alphabet.letters is not None: 2285 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2286 else: 2287 # Assume the worst case, ambiguous DNA or RNA: 2288 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2289 IUPAC.ambiguous_rna.letters.upper()) 2290 n = len(sequence) 2291 if cds: 2292 if str(sequence[:3]).upper() not in table.start_codons: 2293 raise CodonTable.TranslationError( 2294 "First codon '{0}' is not a start codon".format(sequence[:3])) 2295 if n % 3 != 0: 2296 raise CodonTable.TranslationError( 2297 "Sequence length {0} is not a multiple of three".format(n)) 2298 if str(sequence[-3:]).upper() not in stop_codons: 2299 raise CodonTable.TranslationError( 2300 "Final codon '{0}' is not a stop codon".format(sequence[-3:])) 2301 # Don't translate the stop symbol, and manually translate the M 2302 sequence = sequence[3:-3] 2303 n -= 6 2304 amino_acids = ["M"] 2305 elif n % 3 != 0: 2306 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2307 "Explicitly trim the sequence or add trailing N before " 2308 "translation. This may become an error in future.", 2309 BiopythonWarning) 2310 if gap is not None: 2311 if not isinstance(gap, basestring): 2312 raise TypeError("Gap character should be a single character " 2313 "string.") 2314 elif len(gap) > 1: 2315 raise ValueError("Gap character should be a single character " 2316 "string.") 2317 2318 for i in range(0, n - n % 3, 3): 2319 codon = sequence[i:i + 3] 2320 try: 2321 amino_acids.append(forward_table[codon]) 2322 except (KeyError, CodonTable.TranslationError): 2323 if codon in table.stop_codons: 2324 if cds: 2325 raise CodonTable.TranslationError( 2326 "Extra in frame stop codon found.") 2327 if to_stop: 2328 break 2329 amino_acids.append(stop_symbol) 2330 elif valid_letters.issuperset(set(codon)): 2331 # Possible stop codon (e.g. NNN or TAN) 2332 amino_acids.append(pos_stop) 2333 elif gap is not None and codon == gap * 3: 2334 # Gapped translation 2335 amino_acids.append(gap) 2336 else: 2337 raise CodonTable.TranslationError( 2338 "Codon '{0}' is invalid".format(codon)) 2339 return "".join(amino_acids)
2340 2341
2342 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2343 cds=False, gap=None):
2344 """Translate a nucleotide sequence into amino acids. 2345 2346 If given a string, returns a new string object. Given a Seq or 2347 MutableSeq, returns a Seq object with a protein alphabet. 2348 2349 Arguments: 2350 - table - Which codon table to use? This can be either a name 2351 (string), an NCBI identifier (integer), or a CodonTable object 2352 (useful for non-standard genetic codes). Defaults to the "Standard" 2353 table. 2354 - stop_symbol - Single character string, what to use for any 2355 terminators, defaults to the asterisk, "*". 2356 - to_stop - Boolean, defaults to False meaning do a full 2357 translation continuing on past any stop codons 2358 (translated as the specified stop_symbol). If 2359 True, translation is terminated at the first in 2360 frame stop codon (and the stop_symbol is not 2361 appended to the returned protein sequence). 2362 - cds - Boolean, indicates this is a complete CDS. If True, this 2363 checks the sequence starts with a valid alternative start 2364 codon (which will be translated as methionine, M), that the 2365 sequence length is a multiple of three, and that there is a 2366 single in frame stop codon at the end (this will be excluded 2367 from the protein sequence, regardless of the to_stop option). 2368 If these tests fail, an exception is raised. 2369 - gap - Single character string to denote symbol used for gaps. 2370 Defaults to None. 2371 2372 A simple string example using the default (standard) genetic code: 2373 2374 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2375 >>> translate(coding_dna) 2376 'VAIVMGR*KGAR*' 2377 >>> translate(coding_dna, stop_symbol="@") 2378 'VAIVMGR@KGAR@' 2379 >>> translate(coding_dna, to_stop=True) 2380 'VAIVMGR' 2381 2382 Now using NCBI table 2, where TGA is not a stop codon: 2383 2384 >>> translate(coding_dna, table=2) 2385 'VAIVMGRWKGAR*' 2386 >>> translate(coding_dna, table=2, to_stop=True) 2387 'VAIVMGRWKGAR' 2388 2389 In fact this example uses an alternative start codon valid under NCBI 2390 table 2, GTG, which means this example is a complete valid CDS which 2391 when translated should really start with methionine (not valine): 2392 2393 >>> translate(coding_dna, table=2, cds=True) 2394 'MAIVMGRWKGAR' 2395 2396 Note that if the sequence has no in-frame stop codon, then the to_stop 2397 argument has no effect: 2398 2399 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2400 >>> translate(coding_dna2) 2401 'VAIVMGR' 2402 >>> translate(coding_dna2, to_stop=True) 2403 'VAIVMGR' 2404 2405 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2406 or a stop codon. These are translated as "X". Any invalid codon 2407 (e.g. "TA?" or "T-A") will throw a TranslationError. 2408 2409 It will however translate either DNA or RNA. 2410 """ 2411 if isinstance(sequence, Seq): 2412 return sequence.translate(table, stop_symbol, to_stop, cds) 2413 elif isinstance(sequence, MutableSeq): 2414 # Return a Seq object 2415 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2416 else: 2417 # Assume its a string, return a string 2418 try: 2419 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2420 except ValueError: 2421 codon_table = CodonTable.ambiguous_generic_by_name[table] 2422 except (AttributeError, TypeError): 2423 if isinstance(table, CodonTable.CodonTable): 2424 codon_table = table 2425 else: 2426 raise ValueError('Bad table argument') 2427 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds, 2428 gap=gap)
2429 2430
2431 -def reverse_complement(sequence):
2432 """Return the reverse complement sequence of a nucleotide string. 2433 2434 If given a string, returns a new string object. 2435 Given a Seq or a MutableSeq, returns a new Seq object with the same 2436 alphabet. 2437 2438 Supports unambiguous and ambiguous nucleotide sequences. 2439 2440 e.g. 2441 2442 >>> reverse_complement("ACTG-NH") 2443 'DN-CAGT' 2444 """ 2445 return complement(sequence)[::-1]
2446 2447
2448 -def complement(sequence):
2449 """Return the complement sequence of a nucleotide string. 2450 2451 If given a string, returns a new string object. 2452 Given a Seq or a MutableSeq, returns a new Seq object with the same 2453 alphabet. 2454 2455 Supports unambiguous and ambiguous nucleotide sequences. 2456 2457 e.g. 2458 2459 >>> complement("ACTG-NH") 2460 'TGAC-ND' 2461 """ 2462 if isinstance(sequence, Seq): 2463 # Return a Seq 2464 return sequence.complement() 2465 elif isinstance(sequence, MutableSeq): 2466 # Return a Seq 2467 # Don't use the MutableSeq reverse_complement method as it is 2468 # 'in place'. 2469 return sequence.toseq().complement() 2470 2471 # Assume its a string. 2472 # In order to avoid some code duplication, the old code would turn the 2473 # string into a Seq, use the reverse_complement method, and convert back 2474 # to a string. 2475 # This worked, but is over five times slower on short sequences! 2476 if ('U' in sequence or 'u' in sequence) \ 2477 and ('T' in sequence or 't' in sequence): 2478 raise ValueError("Mixed RNA/DNA found") 2479 elif 'U' in sequence or 'u' in sequence: 2480 ttable = _rna_complement_table 2481 else: 2482 ttable = _dna_complement_table 2483 return sequence.translate(ttable)
2484 2485
2486 -def _test():
2487 """Run the Bio.Seq module's doctests (PRIVATE).""" 2488 print("Running doctests...") 2489 import doctest 2490 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2491 print("Done")
2492 2493 2494 if __name__ == "__main__": 2495 _test() 2496