Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2015 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also the Seq_ wiki and the chapter in our tutorial: 
  11      - `HTML Tutorial`_ 
  12      - `PDF Tutorial`_ 
  13   
  14  .. _Seq: http://biopython.org/wiki/Seq 
  15  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  16  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  17  """ 
  18  from __future__ import print_function 
  19   
  20  import string  # for maketrans only 
  21  import array 
  22  import sys 
  23  import warnings 
  24   
  25  from Bio._py3k import range 
  26  from Bio._py3k import basestring 
  27   
  28  from Bio import BiopythonWarning 
  29  from Bio import Alphabet 
  30  from Bio.Alphabet import IUPAC 
  31  from Bio.Data.IUPACData import (ambiguous_dna_complement, 
  32                                  ambiguous_rna_complement) 
  33  from Bio.Data import CodonTable 
  34   
  35   
36 -def _maketrans(complement_mapping):
37 """Makes a python string translation table (PRIVATE). 38 39 Arguments: 40 - complement_mapping - a dictionary such as ambiguous_dna_complement 41 and ambiguous_rna_complement from Data.IUPACData. 42 43 Returns a translation table (a string of length 256) for use with the 44 python string's translate method to use in a (reverse) complement. 45 46 Compatible with lower case and upper case sequences. 47 48 For internal use only. 49 """ 50 before = ''.join(complement_mapping.keys()) 51 after = ''.join(complement_mapping.values()) 52 before += before.lower() 53 after += after.lower() 54 if sys.version_info[0] == 3: 55 return str.maketrans(before, after) 56 else: 57 return string.maketrans(before, after)
58 59 _dna_complement_table = _maketrans(ambiguous_dna_complement) 60 _rna_complement_table = _maketrans(ambiguous_rna_complement) 61 62
63 -class Seq(object):
64 """A read-only sequence object (essentially a string with an alphabet). 65 66 Like normal python strings, our basic sequence object is immutable. 67 This prevents you from doing my_seq[5] = "A" for example, but does allow 68 Seq objects to be used as dictionary keys. 69 70 The Seq object provides a number of string like methods (such as count, 71 find, split and strip), which are alphabet aware where appropriate. 72 73 In addition to the string like sequence, the Seq object has an alphabet 74 property. This is an instance of an Alphabet class from Bio.Alphabet, 75 for example generic DNA, or IUPAC DNA. This describes the type of molecule 76 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 77 (letters). 78 79 The Seq object also provides some biological methods, such as complement, 80 reverse_complement, transcribe, back_transcribe and translate (which are 81 not applicable to sequences with a protein alphabet). 82 """
83 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
84 """Create a Seq object. 85 86 Arguments: 87 - seq - Sequence, required (string) 88 - alphabet - Optional argument, an Alphabet object from 89 Bio.Alphabet 90 91 You will typically use Bio.SeqIO to read in sequences from files as 92 SeqRecord objects, whose sequence will be exposed as a Seq object via 93 the seq property. 94 95 However, will often want to create your own Seq objects directly: 96 97 >>> from Bio.Seq import Seq 98 >>> from Bio.Alphabet import IUPAC 99 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 100 ... IUPAC.protein) 101 >>> my_seq 102 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 103 >>> print(my_seq) 104 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 105 >>> my_seq.alphabet 106 IUPACProtein() 107 """ 108 # Enforce string storage 109 if not isinstance(data, basestring): 110 raise TypeError("The sequence data given to a Seq object should " 111 "be a string (not another Seq object etc)") 112 self._data = data 113 self.alphabet = alphabet # Seq API requirement
114
115 - def __repr__(self):
116 """Return (truncated) representation of the sequence for debugging.""" 117 if len(self) > 60: 118 # Shows the last three letters as it is often useful to see if 119 # there is a stop codon at the end of a sequence. 120 # Note total length is 54+3+3=60 121 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 122 str(self)[:54], 123 str(self)[-3:], 124 self.alphabet) 125 else: 126 return '{0}({1!r}, {2!r})'.format(self.__class__.__name__, 127 self._data, 128 self.alphabet)
129
130 - def __str__(self):
131 """Returns the full sequence as a python string, use str(my_seq). 132 133 Note that Biopython 1.44 and earlier would give a truncated 134 version of repr(my_seq) for str(my_seq). If you are writing code 135 which need to be backwards compatible with old Biopython, you 136 should continue to use my_seq.tostring() rather than str(my_seq). 137 """ 138 return self._data
139
140 - def __hash__(self):
141 """Hash for comparison. 142 143 See the __cmp__ documentation - this has changed from past 144 versions of Biopython! 145 """ 146 # TODO - remove this warning in a future release 147 warnings.warn("Biopython Seq objects now use string comparison. " 148 "Older versions of Biopython used object comparison. " 149 "During this transition, please use hash(id(my_seq)) " 150 "or my_dict[id(my_seq)] if you want the old behaviour, " 151 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 152 "the new string hashing behaviour.", BiopythonWarning) 153 return hash(str(self))
154
155 - def __eq__(self, other):
156 """Compare the sequence to another sequence or a string (README). 157 158 Historically comparing Seq objects has done Python object comparison. 159 After considerable discussion (keeping in mind constraints of the 160 Python language, hashes and dictionary support), Biopython now uses 161 simple string comparison (with a warning about the change). 162 163 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 164 warning. 165 166 During this transition period, please just do explicit comparisons: 167 168 >>> from Bio.Seq import Seq 169 >>> from Bio.Alphabet import generic_dna 170 >>> seq1 = Seq("ACGT") 171 >>> seq2 = Seq("ACGT") 172 >>> id(seq1) == id(seq2) 173 False 174 >>> str(seq1) == str(seq2) 175 True 176 177 The new behaviour is to use string-like equality: 178 179 >>> from Bio.Seq import Seq 180 >>> from Bio.Alphabet import generic_dna 181 >>> seq1 == seq2 182 True 183 >>> seq1 == "ACGT" 184 True 185 >>> seq1 == Seq("ACGT", generic_dna) 186 True 187 188 """ 189 if hasattr(other, "alphabet"): 190 # other could be a Seq or a MutableSeq 191 if not Alphabet._check_type_compatible([self.alphabet, 192 other.alphabet]): 193 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 194 self.alphabet, other.alphabet), 195 BiopythonWarning) 196 return str(self) == str(other)
197
198 - def __ne__(self, other):
199 """Not equal, see __eq__ documentation.""" 200 # Seem to require this method for Python 2 but not needed on Python 3? 201 return not (self == other)
202
203 - def __lt__(self, other):
204 """Less than, see __eq__ documentation.""" 205 if hasattr(other, "alphabet"): 206 if not Alphabet._check_type_compatible([self.alphabet, 207 other.alphabet]): 208 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 209 self.alphabet, other.alphabet), 210 BiopythonWarning) 211 return str(self) < str(other)
212
213 - def __le__(self, other):
214 """Less than or equal, see __eq__ documentation.""" 215 if hasattr(other, "alphabet"): 216 if not Alphabet._check_type_compatible([self.alphabet, 217 other.alphabet]): 218 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 219 self.alphabet, other.alphabet), 220 BiopythonWarning) 221 return str(self) <= str(other)
222
223 - def __len__(self):
224 """Returns the length of the sequence, use len(my_seq).""" 225 return len(self._data) # Seq API requirement
226
227 - def __getitem__(self, index): # Seq API requirement
228 """Returns a subsequence of single letter, use my_seq[index].""" 229 # Note since Python 2.0, __getslice__ is deprecated 230 # and __getitem__ is used instead. 231 # See http://docs.python.org/ref/sequence-methods.html 232 if isinstance(index, int): 233 # Return a single letter as a string 234 return self._data[index] 235 else: 236 # Return the (sub)sequence as another Seq object 237 return Seq(self._data[index], self.alphabet)
238
239 - def __add__(self, other):
240 """Add another sequence or string to this sequence. 241 242 If adding a string to a Seq, the alphabet is preserved: 243 244 >>> from Bio.Seq import Seq 245 >>> from Bio.Alphabet import generic_protein 246 >>> Seq("MELKI", generic_protein) + "LV" 247 Seq('MELKILV', ProteinAlphabet()) 248 249 When adding two Seq (like) objects, the alphabets are important. 250 Consider this example: 251 252 >>> from Bio.Seq import Seq 253 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 254 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 255 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 256 >>> unamb_dna_seq 257 Seq('ACGT', IUPACUnambiguousDNA()) 258 >>> ambig_dna_seq 259 Seq('ACRGT', IUPACAmbiguousDNA()) 260 261 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 262 the more general ambiguous IUPAC DNA alphabet: 263 264 >>> unamb_dna_seq + ambig_dna_seq 265 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 266 267 However, if the default generic alphabet is included, the result is 268 a generic alphabet: 269 270 >>> Seq("") + ambig_dna_seq 271 Seq('ACRGT', Alphabet()) 272 273 You can't add RNA and DNA sequences: 274 275 >>> from Bio.Alphabet import generic_dna, generic_rna 276 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 277 Traceback (most recent call last): 278 ... 279 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 280 281 You can't add nucleotide and protein sequences: 282 283 >>> from Bio.Alphabet import generic_dna, generic_protein 284 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 285 Traceback (most recent call last): 286 ... 287 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 288 """ 289 if hasattr(other, "alphabet"): 290 # other should be a Seq or a MutableSeq 291 if not Alphabet._check_type_compatible([self.alphabet, 292 other.alphabet]): 293 raise TypeError( 294 "Incompatible alphabets {0!r} and {1!r}".format( 295 self.alphabet, other.alphabet)) 296 # They should be the same sequence type (or one of them is generic) 297 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 298 return self.__class__(str(self) + str(other), a) 299 elif isinstance(other, basestring): 300 # other is a plain string - use the current alphabet 301 return self.__class__(str(self) + other, self.alphabet) 302 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 303 if isinstance(other, SeqRecord): 304 # Get the SeqRecord's __radd__ to handle this 305 return NotImplemented 306 else: 307 raise TypeError
308
309 - def __radd__(self, other):
310 """Adding a sequence on the left. 311 312 If adding a string to a Seq, the alphabet is preserved: 313 314 >>> from Bio.Seq import Seq 315 >>> from Bio.Alphabet import generic_protein 316 >>> "LV" + Seq("MELKI", generic_protein) 317 Seq('LVMELKI', ProteinAlphabet()) 318 319 Adding two Seq (like) objects is handled via the __add__ method. 320 """ 321 if hasattr(other, "alphabet"): 322 # other should be a Seq or a MutableSeq 323 if not Alphabet._check_type_compatible([self.alphabet, 324 other.alphabet]): 325 raise TypeError( 326 "Incompatible alphabets {0!r} and {1!r}".format( 327 self.alphabet, other.alphabet)) 328 # They should be the same sequence type (or one of them is generic) 329 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 330 return self.__class__(str(other) + str(self), a) 331 elif isinstance(other, basestring): 332 # other is a plain string - use the current alphabet 333 return self.__class__(other + str(self), self.alphabet) 334 else: 335 raise TypeError
336
337 - def tostring(self): # Seq API requirement
338 """Returns the full sequence as a python string (DEPRECATED). 339 340 You are now encouraged to use str(my_seq) instead of 341 my_seq.tostring().""" 342 from Bio import BiopythonDeprecationWarning 343 warnings.warn("This method is obsolete; please use str(my_seq) " 344 "instead of my_seq.tostring().", 345 BiopythonDeprecationWarning) 346 return str(self) 347
348 - def tomutable(self): # Needed? Or use a function?
349 """Returns the full sequence as a MutableSeq object. 350 351 >>> from Bio.Seq import Seq 352 >>> from Bio.Alphabet import IUPAC 353 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 354 ... IUPAC.protein) 355 >>> my_seq 356 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 357 >>> my_seq.tomutable() 358 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 359 360 Note that the alphabet is preserved. 361 """ 362 return MutableSeq(str(self), self.alphabet) 363
364 - def _get_seq_str_and_check_alphabet(self, other_sequence):
365 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 366 367 For a string argument, returns the string. 368 369 For a Seq or MutableSeq, it checks the alphabet is compatible 370 (raising an exception if it isn't), and then returns a string. 371 """ 372 try: 373 other_alpha = other_sequence.alphabet 374 except AttributeError: 375 # Assume other_sequence is a string 376 return other_sequence 377 378 # Other should be a Seq or a MutableSeq 379 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 380 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 381 self.alphabet, other_alpha)) 382 # Return as a string 383 return str(other_sequence)
384
385 - def count(self, sub, start=0, end=sys.maxsize):
386 """Non-overlapping count method, like that of a python string. 387 388 This behaves like the python string method of the same name, 389 which does a non-overlapping count! 390 391 Returns an integer, the number of occurrences of substring 392 argument sub in the (sub)sequence given by [start:end]. 393 Optional arguments start and end are interpreted as in slice 394 notation. 395 396 Arguments: 397 - sub - a string or another Seq object to look for 398 - start - optional integer, slice start 399 - end - optional integer, slice end 400 401 e.g. 402 403 >>> from Bio.Seq import Seq 404 >>> my_seq = Seq("AAAATGA") 405 >>> print(my_seq.count("A")) 406 5 407 >>> print(my_seq.count("ATG")) 408 1 409 >>> print(my_seq.count(Seq("AT"))) 410 1 411 >>> print(my_seq.count("AT", 2, -1)) 412 1 413 414 HOWEVER, please note because python strings and Seq objects (and 415 MutableSeq objects) do a non-overlapping search, this may not give 416 the answer you expect: 417 418 >>> "AAAA".count("AA") 419 2 420 >>> print(Seq("AAAA").count("AA")) 421 2 422 423 An overlapping search would give the answer as three! 424 """ 425 # If it has one, check the alphabet: 426 sub_str = self._get_seq_str_and_check_alphabet(sub) 427 return str(self).count(sub_str, start, end)
428
429 - def __contains__(self, char):
430 """Implements the 'in' keyword, like a python string. 431 432 e.g. 433 434 >>> from Bio.Seq import Seq 435 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 436 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 437 >>> "AAA" in my_dna 438 True 439 >>> Seq("AAA") in my_dna 440 True 441 >>> Seq("AAA", generic_dna) in my_dna 442 True 443 444 Like other Seq methods, this will raise a type error if another Seq 445 (or Seq like) object with an incompatible alphabet is used: 446 447 >>> Seq("AAA", generic_rna) in my_dna 448 Traceback (most recent call last): 449 ... 450 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 451 >>> Seq("AAA", generic_protein) in my_dna 452 Traceback (most recent call last): 453 ... 454 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 455 """ 456 # If it has one, check the alphabet: 457 sub_str = self._get_seq_str_and_check_alphabet(char) 458 return sub_str in str(self)
459
460 - def find(self, sub, start=0, end=sys.maxsize):
461 """Find method, like that of a python string. 462 463 This behaves like the python string method of the same name. 464 465 Returns an integer, the index of the first occurrence of substring 466 argument sub in the (sub)sequence given by [start:end]. 467 468 Arguments: 469 - sub - a string or another Seq object to look for 470 - start - optional integer, slice start 471 - end - optional integer, slice end 472 473 Returns -1 if the subsequence is NOT found. 474 475 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 476 477 >>> from Bio.Seq import Seq 478 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 479 >>> my_rna.find("AUG") 480 3 481 """ 482 # If it has one, check the alphabet: 483 sub_str = self._get_seq_str_and_check_alphabet(sub) 484 return str(self).find(sub_str, start, end)
485
486 - def rfind(self, sub, start=0, end=sys.maxsize):
487 """Find from right method, like that of a python string. 488 489 This behaves like the python string method of the same name. 490 491 Returns an integer, the index of the last (right most) occurrence of 492 substring argument sub in the (sub)sequence given by [start:end]. 493 494 Arguments: 495 - sub - a string or another Seq object to look for 496 - start - optional integer, slice start 497 - end - optional integer, slice end 498 499 Returns -1 if the subsequence is NOT found. 500 501 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 502 503 >>> from Bio.Seq import Seq 504 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 505 >>> my_rna.rfind("AUG") 506 15 507 """ 508 # If it has one, check the alphabet: 509 sub_str = self._get_seq_str_and_check_alphabet(sub) 510 return str(self).rfind(sub_str, start, end)
511
512 - def startswith(self, prefix, start=0, end=sys.maxsize):
513 """Does the Seq start with the given prefix? Returns True/False. 514 515 This behaves like the python string method of the same name. 516 517 Return True if the sequence starts with the specified prefix 518 (a string or another Seq object), False otherwise. 519 With optional start, test sequence beginning at that position. 520 With optional end, stop comparing sequence at that position. 521 prefix can also be a tuple of strings to try. e.g. 522 523 >>> from Bio.Seq import Seq 524 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 525 >>> my_rna.startswith("GUC") 526 True 527 >>> my_rna.startswith("AUG") 528 False 529 >>> my_rna.startswith("AUG", 3) 530 True 531 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 532 True 533 """ 534 # If it has one, check the alphabet: 535 if isinstance(prefix, tuple): 536 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 537 for p in prefix) 538 return str(self).startswith(prefix_strs, start, end) 539 else: 540 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 541 return str(self).startswith(prefix_str, start, end)
542
543 - def endswith(self, suffix, start=0, end=sys.maxsize):
544 """Does the Seq end with the given suffix? Returns True/False. 545 546 This behaves like the python string method of the same name. 547 548 Return True if the sequence ends with the specified suffix 549 (a string or another Seq object), False otherwise. 550 With optional start, test sequence beginning at that position. 551 With optional end, stop comparing sequence at that position. 552 suffix can also be a tuple of strings to try. e.g. 553 554 >>> from Bio.Seq import Seq 555 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 556 >>> my_rna.endswith("UUG") 557 True 558 >>> my_rna.endswith("AUG") 559 False 560 >>> my_rna.endswith("AUG", 0, 18) 561 True 562 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 563 True 564 """ 565 # If it has one, check the alphabet: 566 if isinstance(suffix, tuple): 567 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 568 for p in suffix) 569 return str(self).endswith(suffix_strs, start, end) 570 else: 571 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 572 return str(self).endswith(suffix_str, start, end)
573
574 - def split(self, sep=None, maxsplit=-1):
575 """Split method, like that of a python string. 576 577 This behaves like the python string method of the same name. 578 579 Return a list of the 'words' in the string (as Seq objects), 580 using sep as the delimiter string. If maxsplit is given, at 581 most maxsplit splits are done. If maxsplit is omitted, all 582 splits are made. 583 584 Following the python string method, sep will by default be any 585 white space (tabs, spaces, newlines) but this is unlikely to 586 apply to biological sequences. 587 588 e.g. 589 590 >>> from Bio.Seq import Seq 591 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 592 >>> my_aa = my_rna.translate() 593 >>> my_aa 594 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 595 >>> for pep in my_aa.split("*"): 596 ... pep 597 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 598 Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 599 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 600 >>> for pep in my_aa.split("*", 1): 601 ... pep 602 Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')) 603 Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 604 605 See also the rsplit method: 606 607 >>> for pep in my_aa.rsplit("*", 1): 608 ... pep 609 Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')) 610 Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*')) 611 """ 612 # If it has one, check the alphabet: 613 sep_str = self._get_seq_str_and_check_alphabet(sep) 614 # TODO - If the sep is the defined stop symbol, or gap char, 615 # should we adjust the alphabet? 616 return [Seq(part, self.alphabet) 617 for part in str(self).split(sep_str, maxsplit)]
618
619 - def rsplit(self, sep=None, maxsplit=-1):
620 """Right split method, like that of a python string. 621 622 This behaves like the python string method of the same name. 623 624 Return a list of the 'words' in the string (as Seq objects), 625 using sep as the delimiter string. If maxsplit is given, at 626 most maxsplit splits are done COUNTING FROM THE RIGHT. 627 If maxsplit is omitted, all splits are made. 628 629 Following the python string method, sep will by default be any 630 white space (tabs, spaces, newlines) but this is unlikely to 631 apply to biological sequences. 632 633 e.g. print(my_seq.rsplit("*",1)) 634 635 See also the split method. 636 """ 637 # If it has one, check the alphabet: 638 sep_str = self._get_seq_str_and_check_alphabet(sep) 639 return [Seq(part, self.alphabet) 640 for part in str(self).rsplit(sep_str, maxsplit)]
641
642 - def strip(self, chars=None):
643 """Returns a new Seq object with leading and trailing ends stripped. 644 645 This behaves like the python string method of the same name. 646 647 Optional argument chars defines which characters to remove. If 648 omitted or None (default) then as for the python string method, 649 this defaults to removing any white space. 650 651 e.g. print(my_seq.strip("-")) 652 653 See also the lstrip and rstrip methods. 654 """ 655 # If it has one, check the alphabet: 656 strip_str = self._get_seq_str_and_check_alphabet(chars) 657 return Seq(str(self).strip(strip_str), self.alphabet)
658
659 - def lstrip(self, chars=None):
660 """Returns a new Seq object with leading (left) end stripped. 661 662 This behaves like the python string method of the same name. 663 664 Optional argument chars defines which characters to remove. If 665 omitted or None (default) then as for the python string method, 666 this defaults to removing any white space. 667 668 e.g. print(my_seq.lstrip("-")) 669 670 See also the strip and rstrip methods. 671 """ 672 # If it has one, check the alphabet: 673 strip_str = self._get_seq_str_and_check_alphabet(chars) 674 return Seq(str(self).lstrip(strip_str), self.alphabet)
675
676 - def rstrip(self, chars=None):
677 """Returns a new Seq object with trailing (right) end stripped. 678 679 This behaves like the python string method of the same name. 680 681 Optional argument chars defines which characters to remove. If 682 omitted or None (default) then as for the python string method, 683 this defaults to removing any white space. 684 685 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 686 687 >>> from Bio.Alphabet import IUPAC 688 >>> from Bio.Seq import Seq 689 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 690 >>> my_seq 691 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 692 >>> my_seq.rstrip("A") 693 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 694 695 See also the strip and lstrip methods. 696 """ 697 # If it has one, check the alphabet: 698 strip_str = self._get_seq_str_and_check_alphabet(chars) 699 return Seq(str(self).rstrip(strip_str), self.alphabet)
700
701 - def upper(self):
702 """Returns an upper case copy of the sequence. 703 704 >>> from Bio.Alphabet import HasStopCodon, generic_protein 705 >>> from Bio.Seq import Seq 706 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 707 >>> my_seq 708 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 709 >>> my_seq.lower() 710 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 711 >>> my_seq.upper() 712 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 713 714 This will adjust the alphabet if required. See also the lower method. 715 """ 716 return Seq(str(self).upper(), self.alphabet._upper())
717
718 - def lower(self):
719 """Returns a lower case copy of the sequence. 720 721 This will adjust the alphabet if required. Note that the IUPAC 722 alphabets are upper case only, and thus a generic alphabet must be 723 substituted. 724 725 >>> from Bio.Alphabet import Gapped, generic_dna 726 >>> from Bio.Alphabet import IUPAC 727 >>> from Bio.Seq import Seq 728 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", 729 ... Gapped(IUPAC.unambiguous_dna, "*")) 730 >>> my_seq 731 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 732 >>> my_seq.lower() 733 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 734 735 See also the upper method. 736 """ 737 return Seq(str(self).lower(), self.alphabet._lower())
738
739 - def complement(self):
740 """Returns the complement sequence. New Seq object. 741 742 >>> from Bio.Seq import Seq 743 >>> from Bio.Alphabet import IUPAC 744 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 745 >>> my_dna 746 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 747 >>> my_dna.complement() 748 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 749 750 You can of course used mixed case sequences, 751 752 >>> from Bio.Seq import Seq 753 >>> from Bio.Alphabet import generic_dna 754 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 755 >>> my_dna 756 Seq('CCCCCgatA-GD', DNAAlphabet()) 757 >>> my_dna.complement() 758 Seq('GGGGGctaT-CH', DNAAlphabet()) 759 760 Note in the above example, ambiguous character D denotes 761 G, A or T so its complement is H (for C, T or A). 762 763 Trying to complement a protein sequence raises an exception. 764 765 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 766 >>> my_protein.complement() 767 Traceback (most recent call last): 768 ... 769 ValueError: Proteins do not have complements! 770 """ 771 base = Alphabet._get_base_alphabet(self.alphabet) 772 if isinstance(base, Alphabet.ProteinAlphabet): 773 raise ValueError("Proteins do not have complements!") 774 if isinstance(base, Alphabet.DNAAlphabet): 775 ttable = _dna_complement_table 776 elif isinstance(base, Alphabet.RNAAlphabet): 777 ttable = _rna_complement_table 778 elif ('U' in self._data or 'u' in self._data) \ 779 and ('T' in self._data or 't' in self._data): 780 # TODO - Handle this cleanly? 781 raise ValueError("Mixed RNA/DNA found") 782 elif 'U' in self._data or 'u' in self._data: 783 ttable = _rna_complement_table 784 else: 785 ttable = _dna_complement_table 786 # Much faster on really long sequences than the previous loop based 787 # one. Thanks to Michael Palmer, University of Waterloo. 788 return Seq(str(self).translate(ttable), self.alphabet)
789
790 - def reverse_complement(self):
791 """Returns the reverse complement sequence. New Seq object. 792 793 >>> from Bio.Seq import Seq 794 >>> from Bio.Alphabet import IUPAC 795 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 796 >>> my_dna 797 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 798 >>> my_dna.reverse_complement() 799 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 800 801 Note in the above example, since R = G or A, its complement 802 is Y (which denotes C or T). 803 804 You can of course used mixed case sequences, 805 806 >>> from Bio.Seq import Seq 807 >>> from Bio.Alphabet import generic_dna 808 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 809 >>> my_dna 810 Seq('CCCCCgatA-G', DNAAlphabet()) 811 >>> my_dna.reverse_complement() 812 Seq('C-TatcGGGGG', DNAAlphabet()) 813 814 Trying to complement a protein sequence raises an exception: 815 816 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 817 >>> my_protein.reverse_complement() 818 Traceback (most recent call last): 819 ... 820 ValueError: Proteins do not have complements! 821 """ 822 # Use -1 stride/step to reverse the complement 823 return self.complement()[::-1]
824
825 - def transcribe(self):
826 """Returns the RNA sequence from a DNA sequence. New Seq object. 827 828 >>> from Bio.Seq import Seq 829 >>> from Bio.Alphabet import IUPAC 830 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 831 ... IUPAC.unambiguous_dna) 832 >>> coding_dna 833 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 834 >>> coding_dna.transcribe() 835 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 836 837 Trying to transcribe a protein or RNA sequence raises an exception: 838 839 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 840 >>> my_protein.transcribe() 841 Traceback (most recent call last): 842 ... 843 ValueError: Proteins cannot be transcribed! 844 """ 845 base = Alphabet._get_base_alphabet(self.alphabet) 846 if isinstance(base, Alphabet.ProteinAlphabet): 847 raise ValueError("Proteins cannot be transcribed!") 848 if isinstance(base, Alphabet.RNAAlphabet): 849 raise ValueError("RNA cannot be transcribed!") 850 851 if self.alphabet == IUPAC.unambiguous_dna: 852 alphabet = IUPAC.unambiguous_rna 853 elif self.alphabet == IUPAC.ambiguous_dna: 854 alphabet = IUPAC.ambiguous_rna 855 else: 856 alphabet = Alphabet.generic_rna 857 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
858
859 - def back_transcribe(self):
860 """Returns the DNA sequence from an RNA sequence. New Seq object. 861 862 >>> from Bio.Seq import Seq 863 >>> from Bio.Alphabet import IUPAC 864 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 865 ... IUPAC.unambiguous_rna) 866 >>> messenger_rna 867 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 868 >>> messenger_rna.back_transcribe() 869 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 870 871 Trying to back-transcribe a protein or DNA sequence raises an 872 exception: 873 874 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 875 >>> my_protein.back_transcribe() 876 Traceback (most recent call last): 877 ... 878 ValueError: Proteins cannot be back transcribed! 879 """ 880 base = Alphabet._get_base_alphabet(self.alphabet) 881 if isinstance(base, Alphabet.ProteinAlphabet): 882 raise ValueError("Proteins cannot be back transcribed!") 883 if isinstance(base, Alphabet.DNAAlphabet): 884 raise ValueError("DNA cannot be back transcribed!") 885 886 if self.alphabet == IUPAC.unambiguous_rna: 887 alphabet = IUPAC.unambiguous_dna 888 elif self.alphabet == IUPAC.ambiguous_rna: 889 alphabet = IUPAC.ambiguous_dna 890 else: 891 alphabet = Alphabet.generic_dna 892 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
893
894 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 895 cds=False, gap=None):
896 """Turns a nucleotide sequence into a protein sequence. New Seq object. 897 898 This method will translate DNA or RNA sequences, and those with a 899 nucleotide or generic alphabet. Trying to translate a protein 900 sequence raises an exception. 901 902 Arguments: 903 - table - Which codon table to use? This can be either a name 904 (string), an NCBI identifier (integer), or a CodonTable 905 object (useful for non-standard genetic codes). This 906 defaults to the "Standard" table. 907 - stop_symbol - Single character string, what to use for 908 terminators. This defaults to the asterisk, "*". 909 - to_stop - Boolean, defaults to False meaning do a full 910 translation continuing on past any stop codons (translated as the 911 specified stop_symbol). If True, translation is terminated at 912 the first in frame stop codon (and the stop_symbol is not 913 appended to the returned protein sequence). 914 - cds - Boolean, indicates this is a complete CDS. If True, 915 this checks the sequence starts with a valid alternative start 916 codon (which will be translated as methionine, M), that the 917 sequence length is a multiple of three, and that there is a 918 single in frame stop codon at the end (this will be excluded 919 from the protein sequence, regardless of the to_stop option). 920 If these tests fail, an exception is raised. 921 - gap - Single character string to denote symbol used for gaps. 922 It will try to guess the gap character from the alphabet. 923 924 e.g. Using the standard table: 925 926 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 927 >>> coding_dna.translate() 928 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 929 >>> coding_dna.translate(stop_symbol="@") 930 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 931 >>> coding_dna.translate(to_stop=True) 932 Seq('VAIVMGR', ExtendedIUPACProtein()) 933 934 Now using NCBI table 2, where TGA is not a stop codon: 935 936 >>> coding_dna.translate(table=2) 937 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 938 >>> coding_dna.translate(table=2, to_stop=True) 939 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 940 941 In fact, GTG is an alternative start codon under NCBI table 2, meaning 942 this sequence could be a complete CDS: 943 944 >>> coding_dna.translate(table=2, cds=True) 945 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 946 947 It isn't a valid CDS under NCBI table 1, due to both the start codon 948 and also the in frame stop codons: 949 950 >>> coding_dna.translate(table=1, cds=True) 951 Traceback (most recent call last): 952 ... 953 TranslationError: First codon 'GTG' is not a start codon 954 955 If the sequence has no in-frame stop codon, then the to_stop argument 956 has no effect: 957 958 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 959 >>> coding_dna2.translate() 960 Seq('LAIVMGR', ExtendedIUPACProtein()) 961 >>> coding_dna2.translate(to_stop=True) 962 Seq('LAIVMGR', ExtendedIUPACProtein()) 963 964 When translating gapped sequences, the gap character is inferred from 965 the alphabet: 966 967 >>> from Bio.Alphabet import Gapped 968 >>> coding_dna3 = Seq("GTG---GCCATT", Gapped(IUPAC.unambiguous_dna)) 969 >>> coding_dna3.translate() 970 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 971 972 It is possible to pass the gap character when the alphabet is missing: 973 974 >>> coding_dna4 = Seq("GTG---GCCATT") 975 >>> coding_dna4.translate(gap='-') 976 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 977 978 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 979 or a stop codon. These are translated as "X". Any invalid codon 980 (e.g. "TA?" or "T-A") will throw a TranslationError. 981 982 NOTE - This does NOT behave like the python string's translate 983 method. For that use str(my_seq).translate(...) instead. 984 """ 985 if isinstance(table, str) and len(table) == 256: 986 raise ValueError("The Seq object translate method DOES NOT take " 987 "a 256 character string mapping table like " 988 "the python string object's translate method. " 989 "Use str(my_seq).translate(...) instead.") 990 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 991 Alphabet.ProteinAlphabet): 992 raise ValueError("Proteins cannot be translated!") 993 try: 994 table_id = int(table) 995 except ValueError: 996 # Assume its a table name 997 if self.alphabet == IUPAC.unambiguous_dna: 998 # Will use standard IUPAC protein alphabet, no need for X 999 codon_table = CodonTable.unambiguous_dna_by_name[table] 1000 elif self.alphabet == IUPAC.unambiguous_rna: 1001 # Will use standard IUPAC protein alphabet, no need for X 1002 codon_table = CodonTable.unambiguous_rna_by_name[table] 1003 else: 1004 # This will use the extended IUPAC protein alphabet with X etc. 1005 # The same table can be used for RNA or DNA (we use this for 1006 # translating strings). 1007 codon_table = CodonTable.ambiguous_generic_by_name[table] 1008 except (AttributeError, TypeError): 1009 # Assume its a CodonTable object 1010 if isinstance(table, CodonTable.CodonTable): 1011 codon_table = table 1012 else: 1013 raise ValueError('Bad table argument') 1014 else: 1015 # Assume its a table ID 1016 if self.alphabet == IUPAC.unambiguous_dna: 1017 # Will use standard IUPAC protein alphabet, no need for X 1018 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 1019 elif self.alphabet == IUPAC.unambiguous_rna: 1020 # Will use standard IUPAC protein alphabet, no need for X 1021 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 1022 else: 1023 # This will use the extended IUPAC protein alphabet with X etc. 1024 # The same table can be used for RNA or DNA (we use this for 1025 # translating strings). 1026 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1027 1028 # Deal with gaps for translation 1029 if hasattr(self.alphabet, "gap_char"): 1030 if not gap: 1031 gap = self.alphabet.gap_char 1032 elif gap != self.alphabet.gap_char: 1033 raise ValueError( 1034 "Gap {0!r} does not match {1!r} from alphabet".format( 1035 gap, self.alphabet.gap_char)) 1036 1037 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop, 1038 cds, gap=gap) 1039 1040 if gap and gap in protein: 1041 alphabet = Alphabet.Gapped(codon_table.protein_alphabet, gap) 1042 else: 1043 alphabet = codon_table.protein_alphabet 1044 1045 if stop_symbol in protein: 1046 alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol) 1047 1048 return Seq(protein, alphabet)
1049
1050 - def ungap(self, gap=None):
1051 """Return a copy of the sequence without the gap character(s). 1052 1053 The gap character can be specified in two ways - either as an explicit 1054 argument, or via the sequence's alphabet. For example: 1055 1056 >>> from Bio.Seq import Seq 1057 >>> from Bio.Alphabet import generic_dna 1058 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1059 >>> my_dna 1060 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1061 >>> my_dna.ungap("-") 1062 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1063 1064 If the gap character is not given as an argument, it will be taken from 1065 the sequence's alphabet (if defined). Notice that the returned 1066 sequence's alphabet is adjusted since it no longer requires a gapped 1067 alphabet: 1068 1069 >>> from Bio.Seq import Seq 1070 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1071 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1072 >>> my_pro 1073 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1074 >>> my_pro.ungap() 1075 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1076 1077 Or, with a simpler gapped DNA example: 1078 1079 >>> from Bio.Seq import Seq 1080 >>> from Bio.Alphabet import IUPAC, Gapped 1081 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1082 >>> my_seq 1083 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1084 >>> my_seq.ungap() 1085 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1086 1087 As long as it is consistent with the alphabet, although it is 1088 redundant, you can still supply the gap character as an argument to 1089 this method: 1090 1091 >>> my_seq 1092 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1093 >>> my_seq.ungap("=") 1094 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1095 1096 However, if the gap character given as the argument disagrees with that 1097 declared in the alphabet, an exception is raised: 1098 1099 >>> my_seq 1100 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1101 >>> my_seq.ungap("-") 1102 Traceback (most recent call last): 1103 ... 1104 ValueError: Gap '-' does not match '=' from alphabet 1105 1106 Finally, if a gap character is not supplied, and the alphabet does not 1107 define one, an exception is raised: 1108 1109 >>> from Bio.Seq import Seq 1110 >>> from Bio.Alphabet import generic_dna 1111 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1112 >>> my_dna 1113 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1114 >>> my_dna.ungap() 1115 Traceback (most recent call last): 1116 ... 1117 ValueError: Gap character not given and not defined in alphabet 1118 1119 """ 1120 if hasattr(self.alphabet, "gap_char"): 1121 if not gap: 1122 gap = self.alphabet.gap_char 1123 elif gap != self.alphabet.gap_char: 1124 raise ValueError( 1125 "Gap {0!r} does not match {1!r} from alphabet".format( 1126 gap, self.alphabet.gap_char)) 1127 alpha = Alphabet._ungap(self.alphabet) 1128 elif not gap: 1129 raise ValueError("Gap character not given and not defined in " 1130 "alphabet") 1131 else: 1132 alpha = self.alphabet # modify! 1133 if len(gap) != 1 or not isinstance(gap, str): 1134 raise ValueError("Unexpected gap character, {0!r}".format(gap)) 1135 return Seq(str(self).replace(gap, ""), alpha)
1136 1137
1138 -class UnknownSeq(Seq):
1139 """A read-only sequence object of known length but unknown contents. 1140 1141 If you have an unknown sequence, you can represent this with a normal 1142 Seq object, for example: 1143 1144 >>> my_seq = Seq("N"*5) 1145 >>> my_seq 1146 Seq('NNNNN', Alphabet()) 1147 >>> len(my_seq) 1148 5 1149 >>> print(my_seq) 1150 NNNNN 1151 1152 However, this is rather wasteful of memory (especially for large 1153 sequences), which is where this class is most useful: 1154 1155 >>> unk_five = UnknownSeq(5) 1156 >>> unk_five 1157 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1158 >>> len(unk_five) 1159 5 1160 >>> print(unk_five) 1161 ????? 1162 1163 You can add unknown sequence together, provided their alphabets and 1164 characters are compatible, and get another memory saving UnknownSeq: 1165 1166 >>> unk_four = UnknownSeq(4) 1167 >>> unk_four 1168 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1169 >>> unk_four + unk_five 1170 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1171 1172 If the alphabet or characters don't match up, the addition gives an 1173 ordinary Seq object: 1174 1175 >>> unk_nnnn = UnknownSeq(4, character = "N") 1176 >>> unk_nnnn 1177 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1178 >>> unk_nnnn + unk_four 1179 Seq('NNNN????', Alphabet()) 1180 1181 Combining with a real Seq gives a new Seq object: 1182 1183 >>> known_seq = Seq("ACGT") 1184 >>> unk_four + known_seq 1185 Seq('????ACGT', Alphabet()) 1186 >>> known_seq + unk_four 1187 Seq('ACGT????', Alphabet()) 1188 """
1189 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, 1190 character=None):
1191 """Create a new UnknownSeq object. 1192 1193 If character is omitted, it is determined from the alphabet, "N" for 1194 nucleotides, "X" for proteins, and "?" otherwise. 1195 """ 1196 self._length = int(length) 1197 if self._length < 0: 1198 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1199 raise ValueError("Length must not be negative.") 1200 self.alphabet = alphabet 1201 if character: 1202 if len(character) != 1: 1203 raise ValueError("character argument should be a single " 1204 "letter string.") 1205 self._character = character 1206 else: 1207 base = Alphabet._get_base_alphabet(alphabet) 1208 # TODO? Check the case of the letters in the alphabet? 1209 # We may have to use "n" instead of "N" etc. 1210 if isinstance(base, Alphabet.NucleotideAlphabet): 1211 self._character = "N" 1212 elif isinstance(base, Alphabet.ProteinAlphabet): 1213 self._character = "X" 1214 else: 1215 self._character = "?"
1216
1217 - def __len__(self):
1218 """Returns the stated length of the unknown sequence.""" 1219 return self._length
1220
1221 - def __str__(self):
1222 """Returns the unknown sequence as full string of the given length.""" 1223 return self._character * self._length
1224
1225 - def __repr__(self):
1226 return "UnknownSeq({0}, alphabet = {1!r}, character = {2!r})".format( 1227 self._length, self.alphabet, self._character)
1228
1229 - def __add__(self, other):
1230 """Add another sequence or string to this sequence. 1231 1232 Adding two UnknownSeq objects returns another UnknownSeq object 1233 provided the character is the same and the alphabets are compatible. 1234 1235 >>> from Bio.Seq import UnknownSeq 1236 >>> from Bio.Alphabet import generic_protein 1237 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1238 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1239 1240 If the characters differ, an UnknownSeq object cannot be used, so a 1241 Seq object is returned: 1242 1243 >>> from Bio.Seq import UnknownSeq 1244 >>> from Bio.Alphabet import generic_protein 1245 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1246 ... character="x") 1247 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1248 1249 If adding a string to an UnknownSeq, a new Seq is returned with the 1250 same alphabet: 1251 1252 >>> from Bio.Seq import UnknownSeq 1253 >>> from Bio.Alphabet import generic_protein 1254 >>> UnknownSeq(5, generic_protein) + "LV" 1255 Seq('XXXXXLV', ProteinAlphabet()) 1256 """ 1257 if isinstance(other, UnknownSeq) and \ 1258 other._character == self._character: 1259 # TODO - Check the alphabets match 1260 return UnknownSeq(len(self) + len(other), 1261 self.alphabet, self._character) 1262 # Offload to the base class... 1263 return Seq(str(self), self.alphabet) + other
1264
1265 - def __radd__(self, other):
1266 # If other is an UnknownSeq, then __add__ would be called. 1267 # Offload to the base class... 1268 return other + Seq(str(self), self.alphabet)
1269
1270 - def __getitem__(self, index):
1271 """Get a subsequence from the UnknownSeq object. 1272 1273 >>> unk = UnknownSeq(8, character="N") 1274 >>> print(unk[:]) 1275 NNNNNNNN 1276 >>> print(unk[5:3]) 1277 <BLANKLINE> 1278 >>> print(unk[1:-1]) 1279 NNNNNN 1280 >>> print(unk[1:-1:2]) 1281 NNN 1282 """ 1283 if isinstance(index, int): 1284 # TODO - Check the bounds without wasting memory 1285 return str(self)[index] 1286 old_length = self._length 1287 step = index.step 1288 if step is None or step == 1: 1289 # This calculates the length you'd get from ("N"*old_length)[index] 1290 start = index.start 1291 end = index.stop 1292 if start is None: 1293 start = 0 1294 elif start < 0: 1295 start = max(0, old_length + start) 1296 elif start > old_length: 1297 start = old_length 1298 if end is None: 1299 end = old_length 1300 elif end < 0: 1301 end = max(0, old_length + end) 1302 elif end > old_length: 1303 end = old_length 1304 new_length = max(0, end - start) 1305 elif step == 0: 1306 raise ValueError("slice step cannot be zero") 1307 else: 1308 # TODO - handle step efficiently 1309 new_length = len(("X" * old_length)[index]) 1310 # assert new_length == len(("X"*old_length)[index]), \ 1311 # (index, start, end, step, old_length, 1312 # new_length, len(("X"*old_length)[index])) 1313 return UnknownSeq(new_length, self.alphabet, self._character)
1314
1315 - def count(self, sub, start=0, end=sys.maxsize):
1316 """Non-overlapping count method, like that of a python string. 1317 1318 This behaves like the python string (and Seq object) method of the 1319 same name, which does a non-overlapping count! 1320 1321 Returns an integer, the number of occurrences of substring 1322 argument sub in the (sub)sequence given by [start:end]. 1323 Optional arguments start and end are interpreted as in slice 1324 notation. 1325 1326 Arguments: 1327 - sub - a string or another Seq object to look for 1328 - start - optional integer, slice start 1329 - end - optional integer, slice end 1330 1331 >>> "NNNN".count("N") 1332 4 1333 >>> Seq("NNNN").count("N") 1334 4 1335 >>> UnknownSeq(4, character="N").count("N") 1336 4 1337 >>> UnknownSeq(4, character="N").count("A") 1338 0 1339 >>> UnknownSeq(4, character="N").count("AA") 1340 0 1341 1342 HOWEVER, please note because that python strings and Seq objects (and 1343 MutableSeq objects) do a non-overlapping search, this may not give 1344 the answer you expect: 1345 1346 >>> UnknownSeq(4, character="N").count("NN") 1347 2 1348 >>> UnknownSeq(4, character="N").count("NNN") 1349 1 1350 """ 1351 sub_str = self._get_seq_str_and_check_alphabet(sub) 1352 if len(sub_str) == 1: 1353 if str(sub_str) == self._character: 1354 if start == 0 and end >= self._length: 1355 return self._length 1356 else: 1357 # This could be done more cleverly... 1358 return str(self).count(sub_str, start, end) 1359 else: 1360 return 0 1361 else: 1362 if set(sub_str) == set(self._character): 1363 if start == 0 and end >= self._length: 1364 return self._length // len(sub_str) 1365 else: 1366 # This could be done more cleverly... 1367 return str(self).count(sub_str, start, end) 1368 else: 1369 return 0
1370
1371 - def complement(self):
1372 """The complement of an unknown nucleotide equals itself. 1373 1374 >>> my_nuc = UnknownSeq(8) 1375 >>> my_nuc 1376 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1377 >>> print(my_nuc) 1378 ???????? 1379 >>> my_nuc.complement() 1380 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1381 >>> print(my_nuc.complement()) 1382 ???????? 1383 """ 1384 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1385 Alphabet.ProteinAlphabet): 1386 raise ValueError("Proteins do not have complements!") 1387 return self
1388
1389 - def reverse_complement(self):
1390 """The reverse complement of an unknown nucleotide equals itself. 1391 1392 >>> my_nuc = UnknownSeq(10) 1393 >>> my_nuc 1394 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1395 >>> print(my_nuc) 1396 ?????????? 1397 >>> my_nuc.reverse_complement() 1398 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1399 >>> print(my_nuc.reverse_complement()) 1400 ?????????? 1401 """ 1402 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1403 Alphabet.ProteinAlphabet): 1404 raise ValueError("Proteins do not have complements!") 1405 return self
1406
1407 - def transcribe(self):
1408 """Returns unknown RNA sequence from an unknown DNA sequence. 1409 1410 >>> my_dna = UnknownSeq(10, character="N") 1411 >>> my_dna 1412 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1413 >>> print(my_dna) 1414 NNNNNNNNNN 1415 >>> my_rna = my_dna.transcribe() 1416 >>> my_rna 1417 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1418 >>> print(my_rna) 1419 NNNNNNNNNN 1420 """ 1421 # Offload the alphabet stuff 1422 s = Seq(self._character, self.alphabet).transcribe() 1423 return UnknownSeq(self._length, s.alphabet, self._character)
1424
1425 - def back_transcribe(self):
1426 """Returns unknown DNA sequence from an unknown RNA sequence. 1427 1428 >>> my_rna = UnknownSeq(20, character="N") 1429 >>> my_rna 1430 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1431 >>> print(my_rna) 1432 NNNNNNNNNNNNNNNNNNNN 1433 >>> my_dna = my_rna.back_transcribe() 1434 >>> my_dna 1435 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1436 >>> print(my_dna) 1437 NNNNNNNNNNNNNNNNNNNN 1438 """ 1439 # Offload the alphabet stuff 1440 s = Seq(self._character, self.alphabet).back_transcribe() 1441 return UnknownSeq(self._length, s.alphabet, self._character)
1442
1443 - def upper(self):
1444 """Returns an upper case copy of the sequence. 1445 1446 >>> from Bio.Alphabet import generic_dna 1447 >>> from Bio.Seq import UnknownSeq 1448 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1449 >>> my_seq 1450 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1451 >>> print(my_seq) 1452 nnnnnnnnnnnnnnnnnnnn 1453 >>> my_seq.upper() 1454 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1455 >>> print(my_seq.upper()) 1456 NNNNNNNNNNNNNNNNNNNN 1457 1458 This will adjust the alphabet if required. See also the lower method. 1459 """ 1460 return UnknownSeq(self._length, self.alphabet._upper(), 1461 self._character.upper())
1462
1463 - def lower(self):
1464 """Returns a lower case copy of the sequence. 1465 1466 This will adjust the alphabet if required: 1467 1468 >>> from Bio.Alphabet import IUPAC 1469 >>> from Bio.Seq import UnknownSeq 1470 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1471 >>> my_seq 1472 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1473 >>> print(my_seq) 1474 XXXXXXXXXXXXXXXXXXXX 1475 >>> my_seq.lower() 1476 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1477 >>> print(my_seq.lower()) 1478 xxxxxxxxxxxxxxxxxxxx 1479 1480 See also the upper method. 1481 """ 1482 return UnknownSeq(self._length, self.alphabet._lower(), 1483 self._character.lower())
1484
1485 - def translate(self, **kwargs):
1486 """Translate an unknown nucleotide sequence into an unknown protein. 1487 1488 e.g. 1489 1490 >>> my_seq = UnknownSeq(9, character="N") 1491 >>> print(my_seq) 1492 NNNNNNNNN 1493 >>> my_protein = my_seq.translate() 1494 >>> my_protein 1495 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1496 >>> print(my_protein) 1497 XXX 1498 1499 In comparison, using a normal Seq object: 1500 1501 >>> my_seq = Seq("NNNNNNNNN") 1502 >>> print(my_seq) 1503 NNNNNNNNN 1504 >>> my_protein = my_seq.translate() 1505 >>> my_protein 1506 Seq('XXX', ExtendedIUPACProtein()) 1507 >>> print(my_protein) 1508 XXX 1509 1510 """ 1511 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1512 Alphabet.ProteinAlphabet): 1513 raise ValueError("Proteins cannot be translated!") 1514 return UnknownSeq(self._length // 3, Alphabet.generic_protein, "X")
1515
1516 - def ungap(self, gap=None):
1517 """Return a copy of the sequence without the gap character(s). 1518 1519 The gap character can be specified in two ways - either as an explicit 1520 argument, or via the sequence's alphabet. For example: 1521 1522 >>> from Bio.Seq import UnknownSeq 1523 >>> from Bio.Alphabet import Gapped, generic_dna 1524 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1525 >>> my_dna 1526 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1527 >>> my_dna.ungap() 1528 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1529 >>> my_dna.ungap("-") 1530 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1531 1532 If the UnknownSeq is using the gap character, then an empty Seq is 1533 returned: 1534 1535 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1536 >>> my_gap 1537 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1538 >>> my_gap.ungap() 1539 Seq('', DNAAlphabet()) 1540 >>> my_gap.ungap("-") 1541 Seq('', DNAAlphabet()) 1542 1543 Notice that the returned sequence's alphabet is adjusted to remove any 1544 explicit gap character declaration. 1545 """ 1546 # Offload the alphabet stuff 1547 s = Seq(self._character, self.alphabet).ungap() 1548 if s: 1549 return UnknownSeq(self._length, s.alphabet, self._character) 1550 else: 1551 return Seq("", s.alphabet)
1552 1553
1554 -class MutableSeq(object):
1555 """An editable sequence object (with an alphabet). 1556 1557 Unlike normal python strings and our basic sequence object (the Seq class) 1558 which are immutable, the MutableSeq lets you edit the sequence in place. 1559 However, this means you cannot use a MutableSeq object as a dictionary key. 1560 1561 >>> from Bio.Seq import MutableSeq 1562 >>> from Bio.Alphabet import generic_dna 1563 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1564 >>> my_seq 1565 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1566 >>> my_seq[5] 1567 'T' 1568 >>> my_seq[5] = "A" 1569 >>> my_seq 1570 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1571 >>> my_seq[5] 1572 'A' 1573 >>> my_seq[5:8] = "NNN" 1574 >>> my_seq 1575 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1576 >>> len(my_seq) 1577 11 1578 1579 Note that the MutableSeq object does not support as many string-like 1580 or biological methods as the Seq object. 1581 """
1582 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1583 if sys.version_info[0] == 3: 1584 self.array_indicator = "u" 1585 else: 1586 self.array_indicator = "c" 1587 if isinstance(data, str): # TODO - What about unicode? 1588 self.data = array.array(self.array_indicator, data) 1589 else: 1590 self.data = data # assumes the input is an array 1591 self.alphabet = alphabet
1592
1593 - def __repr__(self):
1594 """Return (truncated) representation of the sequence for debugging.""" 1595 if len(self) > 60: 1596 # Shows the last three letters as it is often useful to see if 1597 # there is a stop codon at the end of a sequence. 1598 # Note total length is 54+3+3=60 1599 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 1600 str(self[:54]), 1601 str(self[-3:]), 1602 self.alphabet) 1603 else: 1604 return "{0}('{1}', {2!r})".format(self.__class__.__name__, 1605 str(self), 1606 self.alphabet)
1607
1608 - def __str__(self):
1609 """Returns the full sequence as a python string. 1610 1611 Note that Biopython 1.44 and earlier would give a truncated 1612 version of repr(my_seq) for str(my_seq). If you are writing code 1613 which needs to be backwards compatible with old Biopython, you 1614 should continue to use my_seq.tostring() rather than str(my_seq). 1615 """ 1616 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1617 return "".join(self.data)
1618
1619 - def __eq__(self, other):
1620 """Compare the sequence to another sequence or a string (README). 1621 1622 Currently if compared to another sequence the alphabets must be 1623 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1624 an exception. Otherwise only the sequence itself is compared, not the 1625 precise alphabet. 1626 1627 A future release of Biopython will change this (and the Seq object etc) 1628 to use simple string comparison. The plan is that comparing sequences 1629 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1630 but not an exception. 1631 1632 During this transition period, please just do explicit comparisons: 1633 1634 >>> seq1 = MutableSeq("ACGT") 1635 >>> seq2 = MutableSeq("ACGT") 1636 >>> id(seq1) == id(seq2) 1637 False 1638 >>> str(seq1) == str(seq2) 1639 True 1640 1641 Biopython now does: 1642 1643 >>> seq1 == seq2 1644 True 1645 >>> seq1 == Seq("ACGT") 1646 True 1647 >>> seq1 == "ACGT" 1648 True 1649 1650 """ 1651 if hasattr(other, "alphabet"): 1652 if not Alphabet._check_type_compatible([self.alphabet, 1653 other.alphabet]): 1654 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1655 self.alphabet, other.alphabet), 1656 BiopythonWarning) 1657 if isinstance(other, MutableSeq): 1658 return self.data == other.data 1659 return str(self) == str(other)
1660
1661 - def __ne__(self, other):
1662 """Not equal, see __eq__ documentation.""" 1663 # Seem to require this method for Python 2 but not needed on Python 3? 1664 return not (self == other)
1665
1666 - def __lt__(self, other):
1667 """Less than, see __eq__ documentation.""" 1668 if hasattr(other, "alphabet"): 1669 if not Alphabet._check_type_compatible([self.alphabet, 1670 other.alphabet]): 1671 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1672 self.alphabet, other.alphabet), 1673 BiopythonWarning) 1674 if isinstance(other, MutableSeq): 1675 return self.data < other.data 1676 return str(self) < str(other)
1677
1678 - def __le__(self, other):
1679 """Less than or equal, see __eq__ documentation.""" 1680 if hasattr(other, "alphabet"): 1681 if not Alphabet._check_type_compatible([self.alphabet, 1682 other.alphabet]): 1683 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1684 self.alphabet, other.alphabet), 1685 BiopythonWarning) 1686 if isinstance(other, MutableSeq): 1687 return self.data <= other.data 1688 return str(self) <= str(other)
1689
1690 - def __len__(self):
1691 return len(self.data)
1692
1693 - def __getitem__(self, index):
1694 # Note since Python 2.0, __getslice__ is deprecated 1695 # and __getitem__ is used instead. 1696 # See http://docs.python.org/ref/sequence-methods.html 1697 if isinstance(index, int): 1698 # Return a single letter as a string 1699 return self.data[index] 1700 else: 1701 # Return the (sub)sequence as another Seq object 1702 return MutableSeq(self.data[index], self.alphabet)
1703
1704 - def __setitem__(self, index, value):
1705 # Note since Python 2.0, __setslice__ is deprecated 1706 # and __setitem__ is used instead. 1707 # See http://docs.python.org/ref/sequence-methods.html 1708 if isinstance(index, int): 1709 # Replacing a single letter with a new string 1710 self.data[index] = value 1711 else: 1712 # Replacing a sub-sequence 1713 if isinstance(value, MutableSeq): 1714 self.data[index] = value.data 1715 elif isinstance(value, type(self.data)): 1716 self.data[index] = value 1717 else: 1718 self.data[index] = array.array(self.array_indicator, 1719 str(value))
1720
1721 - def __delitem__(self, index):
1722 # Note since Python 2.0, __delslice__ is deprecated 1723 # and __delitem__ is used instead. 1724 # See http://docs.python.org/ref/sequence-methods.html 1725 1726 # Could be deleting a single letter, or a slice 1727 del self.data[index]
1728
1729 - def __add__(self, other):
1730 """Add another sequence or string to this sequence. 1731 1732 Returns a new MutableSeq object.""" 1733 if hasattr(other, "alphabet"): 1734 # other should be a Seq or a MutableSeq 1735 if not Alphabet._check_type_compatible([self.alphabet, 1736 other.alphabet]): 1737 raise TypeError( 1738 "Incompatible alphabets {0!r} and {1!r}".format( 1739 self.alphabet, other.alphabet)) 1740 # They should be the same sequence type (or one of them is generic) 1741 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1742 if isinstance(other, MutableSeq): 1743 # See test_GAQueens.py for an historic usage of a non-string 1744 # alphabet! Adding the arrays should support this. 1745 return self.__class__(self.data + other.data, a) 1746 else: 1747 return self.__class__(str(self) + str(other), a) 1748 elif isinstance(other, basestring): 1749 # other is a plain string - use the current alphabet 1750 return self.__class__(str(self) + str(other), self.alphabet) 1751 else: 1752 raise TypeError
1753
1754 - def __radd__(self, other):
1755 if hasattr(other, "alphabet"): 1756 # other should be a Seq or a MutableSeq 1757 if not Alphabet._check_type_compatible([self.alphabet, 1758 other.alphabet]): 1759 raise TypeError( 1760 "Incompatible alphabets {0!r} and {1!r}".format( 1761 self.alphabet, other.alphabet)) 1762 # They should be the same sequence type (or one of them is generic) 1763 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1764 if isinstance(other, MutableSeq): 1765 # See test_GAQueens.py for an historic usage of a non-string 1766 # alphabet! Adding the arrays should support this. 1767 return self.__class__(other.data + self.data, a) 1768 else: 1769 return self.__class__(str(other) + str(self), a) 1770 elif isinstance(other, basestring): 1771 # other is a plain string - use the current alphabet 1772 return self.__class__(str(other) + str(self), self.alphabet) 1773 else: 1774 raise TypeError
1775
1776 - def append(self, c):
1777 self.data.append(c)
1778
1779 - def insert(self, i, c):
1780 self.data.insert(i, c)
1781
1782 - def pop(self, i=(-1)):
1783 c = self.data[i] 1784 del self.data[i] 1785 return c
1786
1787 - def remove(self, item):
1788 for i in range(len(self.data)): 1789 if self.data[i] == item: 1790 del self.data[i] 1791 return 1792 raise ValueError("MutableSeq.remove(x): x not in list")
1793
1794 - def count(self, sub, start=0, end=sys.maxsize):
1795 """Non-overlapping count method, like that of a python string. 1796 1797 This behaves like the python string method of the same name, 1798 which does a non-overlapping count! 1799 1800 Returns an integer, the number of occurrences of substring 1801 argument sub in the (sub)sequence given by [start:end]. 1802 Optional arguments start and end are interpreted as in slice 1803 notation. 1804 1805 Arguments: 1806 - sub - a string or another Seq object to look for 1807 - start - optional integer, slice start 1808 - end - optional integer, slice end 1809 1810 e.g. 1811 1812 >>> from Bio.Seq import MutableSeq 1813 >>> my_mseq = MutableSeq("AAAATGA") 1814 >>> print(my_mseq.count("A")) 1815 5 1816 >>> print(my_mseq.count("ATG")) 1817 1 1818 >>> print(my_mseq.count(Seq("AT"))) 1819 1 1820 >>> print(my_mseq.count("AT", 2, -1)) 1821 1 1822 1823 HOWEVER, please note because that python strings, Seq objects and 1824 MutableSeq objects do a non-overlapping search, this may not give 1825 the answer you expect: 1826 1827 >>> "AAAA".count("AA") 1828 2 1829 >>> print(MutableSeq("AAAA").count("AA")) 1830 2 1831 1832 An overlapping search would give the answer as three! 1833 """ 1834 try: 1835 # TODO - Should we check the alphabet? 1836 search = str(sub) 1837 except AttributeError: 1838 search = sub 1839 1840 if not isinstance(search, basestring): 1841 raise TypeError("expected a string, Seq or MutableSeq") 1842 1843 if len(search) == 1: 1844 # Try and be efficient and work directly from the array. 1845 count = 0 1846 for c in self.data[start:end]: 1847 if c == search: 1848 count += 1 1849 return count 1850 else: 1851 # TODO - Can we do this more efficiently? 1852 return str(self).count(search, start, end)
1853
1854 - def index(self, item):
1855 for i in range(len(self.data)): 1856 if self.data[i] == item: 1857 return i 1858 raise ValueError("MutableSeq.index(x): x not in list")
1859
1860 - def reverse(self):
1861 """Modify the mutable sequence to reverse itself. 1862 1863 No return value. 1864 """ 1865 self.data.reverse()
1866
1867 - def complement(self):
1868 """Modify the mutable sequence to take on its complement. 1869 1870 Trying to complement a protein sequence raises an exception. 1871 1872 No return value. 1873 """ 1874 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1875 Alphabet.ProteinAlphabet): 1876 raise ValueError("Proteins do not have complements!") 1877 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1878 d = ambiguous_dna_complement 1879 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1880 d = ambiguous_rna_complement 1881 elif 'U' in self.data and 'T' in self.data: 1882 # TODO - Handle this cleanly? 1883 raise ValueError("Mixed RNA/DNA found") 1884 elif 'U' in self.data: 1885 d = ambiguous_rna_complement 1886 else: 1887 d = ambiguous_dna_complement 1888 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1889 d.update(c) 1890 self.data = [d[c] for c in self.data] 1891 self.data = array.array(self.array_indicator, self.data)
1892
1893 - def reverse_complement(self):
1894 """Modify the mutable sequence to take on its reverse complement. 1895 1896 Trying to reverse complement a protein sequence raises an exception. 1897 1898 No return value. 1899 """ 1900 self.complement() 1901 self.data.reverse()
1902 1903 # Sorting a sequence makes no sense. 1904 # def sort(self, *args): self.data.sort(*args) 1905
1906 - def extend(self, other):
1907 if isinstance(other, MutableSeq): 1908 for c in other.data: 1909 self.data.append(c) 1910 else: 1911 for c in other: 1912 self.data.append(c)
1913
1914 - def tostring(self):
1915 """Returns the full sequence as a python string (DEPRECATED). 1916 1917 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 1918 as this method is officially deprecated. 1919 1920 Because str(my_seq) will give you the full sequence as a python string, 1921 there is often no need to make an explicit conversion. For example, 1922 1923 >>> my_seq = Seq("ATCGTG") 1924 >>> my_name = "seq_1" 1925 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1926 ID={seq_1}, sequence={ATCGTG} 1927 1928 On Biopython 1.44 or older you would have to have done this: 1929 1930 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1931 ID={seq_1}, sequence={ATCGTG} 1932 """ 1933 from Bio import BiopythonDeprecationWarning 1934 warnings.warn("This method is obsolete; please use str(my_seq) " 1935 "instead of my_seq.tostring().", 1936 BiopythonDeprecationWarning) 1937 return "".join(self.data)
1938
1939 - def toseq(self):
1940 """Returns the full sequence as a new immutable Seq object. 1941 1942 >>> from Bio.Seq import Seq 1943 >>> from Bio.Alphabet import IUPAC 1944 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1945 ... IUPAC.protein) 1946 >>> my_mseq 1947 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1948 >>> my_mseq.toseq() 1949 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1950 1951 Note that the alphabet is preserved. 1952 """ 1953 return Seq("".join(self.data), self.alphabet)
1954 1955 1956 # The transcribe, backward_transcribe, and translate functions are 1957 # user-friendly versions of the corresponding functions in Bio.Transcribe 1958 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1959
1960 -def transcribe(dna):
1961 """Transcribes a DNA sequence into RNA. 1962 1963 If given a string, returns a new string object. 1964 1965 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1966 1967 Trying to transcribe a protein or RNA sequence raises an exception. 1968 1969 e.g. 1970 1971 >>> transcribe("ACTGN") 1972 'ACUGN' 1973 """ 1974 if isinstance(dna, Seq): 1975 return dna.transcribe() 1976 elif isinstance(dna, MutableSeq): 1977 return dna.toseq().transcribe() 1978 else: 1979 return dna.replace('T', 'U').replace('t', 'u')
1980 1981
1982 -def back_transcribe(rna):
1983 """Back-transcribes an RNA sequence into DNA. 1984 1985 If given a string, returns a new string object. 1986 1987 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1988 1989 Trying to transcribe a protein or DNA sequence raises an exception. 1990 1991 e.g. 1992 1993 >>> back_transcribe("ACUGN") 1994 'ACTGN' 1995 """ 1996 if isinstance(rna, Seq): 1997 return rna.back_transcribe() 1998 elif isinstance(rna, MutableSeq): 1999 return rna.toseq().back_transcribe() 2000 else: 2001 return rna.replace('U', 'T').replace('u', 't')
2002 2003
2004 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 2005 cds=False, pos_stop="X", gap=None):
2006 """Helper function to translate a nucleotide string (PRIVATE). 2007 2008 Arguments: 2009 - sequence - a string 2010 - table - a CodonTable object (NOT a table name or id number) 2011 - stop_symbol - a single character string, what to use for terminators. 2012 - to_stop - boolean, should translation terminate at the first 2013 in frame stop codon? If there is no in-frame stop codon 2014 then translation continues to the end. 2015 - pos_stop - a single character string for a possible stop codon 2016 (e.g. TAN or NNN) 2017 - cds - Boolean, indicates this is a complete CDS. If True, this 2018 checks the sequence starts with a valid alternative start 2019 codon (which will be translated as methionine, M), that the 2020 sequence length is a multiple of three, and that there is a 2021 single in frame stop codon at the end (this will be excluded 2022 from the protein sequence, regardless of the to_stop option). 2023 If these tests fail, an exception is raised. 2024 - gap - Single character string to denote symbol used for gaps. 2025 Defaults to None. 2026 2027 Returns a string. 2028 2029 e.g. 2030 2031 >>> from Bio.Data import CodonTable 2032 >>> table = CodonTable.ambiguous_dna_by_id[1] 2033 >>> _translate_str("AAA", table) 2034 'K' 2035 >>> _translate_str("TAR", table) 2036 '*' 2037 >>> _translate_str("TAN", table) 2038 'X' 2039 >>> _translate_str("TAN", table, pos_stop="@") 2040 '@' 2041 >>> _translate_str("TA?", table) 2042 Traceback (most recent call last): 2043 ... 2044 TranslationError: Codon 'TA?' is invalid 2045 2046 In a change to older versions of Biopython, partial codons are now 2047 always regarded as an error (previously only checked if cds=True) 2048 and will trigger a warning (likely to become an exception in a 2049 future release). 2050 2051 If **cds=True**, the start and stop codons are checked, and the start 2052 codon will be translated at methionine. The sequence must be an 2053 while number of codons. 2054 2055 >>> _translate_str("ATGCCCTAG", table, cds=True) 2056 'MP' 2057 >>> _translate_str("AAACCCTAG", table, cds=True) 2058 Traceback (most recent call last): 2059 ... 2060 TranslationError: First codon 'AAA' is not a start codon 2061 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2062 Traceback (most recent call last): 2063 ... 2064 TranslationError: Extra in frame stop codon found. 2065 """ 2066 sequence = sequence.upper() 2067 amino_acids = [] 2068 forward_table = table.forward_table 2069 stop_codons = table.stop_codons 2070 if table.nucleotide_alphabet.letters is not None: 2071 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2072 else: 2073 # Assume the worst case, ambiguous DNA or RNA: 2074 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2075 IUPAC.ambiguous_rna.letters.upper()) 2076 n = len(sequence) 2077 if cds: 2078 if str(sequence[:3]).upper() not in table.start_codons: 2079 raise CodonTable.TranslationError( 2080 "First codon '{0}' is not a start codon".format(sequence[:3])) 2081 if n % 3 != 0: 2082 raise CodonTable.TranslationError( 2083 "Sequence length {0} is not a multiple of three".format(n)) 2084 if str(sequence[-3:]).upper() not in stop_codons: 2085 raise CodonTable.TranslationError( 2086 "Final codon '{0}' is not a stop codon".format(sequence[-3:])) 2087 # Don't translate the stop symbol, and manually translate the M 2088 sequence = sequence[3:-3] 2089 n -= 6 2090 amino_acids = ["M"] 2091 elif n % 3 != 0: 2092 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2093 "Explicitly trim the sequence or add trailing N before " 2094 "translation. This may become an error in future.", 2095 BiopythonWarning) 2096 if gap is not None: 2097 if not isinstance(gap, basestring): 2098 raise TypeError("Gap character should be a single character " 2099 "string.") 2100 elif len(gap) > 1: 2101 raise ValueError("Gap character should be a single character " 2102 "string.") 2103 2104 for i in range(0, n - n % 3, 3): 2105 codon = sequence[i:i + 3] 2106 try: 2107 amino_acids.append(forward_table[codon]) 2108 except (KeyError, CodonTable.TranslationError): 2109 if codon in table.stop_codons: 2110 if cds: 2111 raise CodonTable.TranslationError( 2112 "Extra in frame stop codon found.") 2113 if to_stop: 2114 break 2115 amino_acids.append(stop_symbol) 2116 elif valid_letters.issuperset(set(codon)): 2117 # Possible stop codon (e.g. NNN or TAN) 2118 amino_acids.append(pos_stop) 2119 elif gap is not None and codon == gap * 3: 2120 # Gapped translation 2121 amino_acids.append(gap) 2122 else: 2123 raise CodonTable.TranslationError( 2124 "Codon '{0}' is invalid".format(codon)) 2125 return "".join(amino_acids)
2126 2127
2128 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2129 cds=False, gap=None):
2130 """Translate a nucleotide sequence into amino acids. 2131 2132 If given a string, returns a new string object. Given a Seq or 2133 MutableSeq, returns a Seq object with a protein alphabet. 2134 2135 Arguments: 2136 - table - Which codon table to use? This can be either a name 2137 (string), an NCBI identifier (integer), or a CodonTable object 2138 (useful for non-standard genetic codes). Defaults to the "Standard" 2139 table. 2140 - stop_symbol - Single character string, what to use for any 2141 terminators, defaults to the asterisk, "*". 2142 - to_stop - Boolean, defaults to False meaning do a full 2143 translation continuing on past any stop codons 2144 (translated as the specified stop_symbol). If 2145 True, translation is terminated at the first in 2146 frame stop codon (and the stop_symbol is not 2147 appended to the returned protein sequence). 2148 - cds - Boolean, indicates this is a complete CDS. If True, this 2149 checks the sequence starts with a valid alternative start 2150 codon (which will be translated as methionine, M), that the 2151 sequence length is a multiple of three, and that there is a 2152 single in frame stop codon at the end (this will be excluded 2153 from the protein sequence, regardless of the to_stop option). 2154 If these tests fail, an exception is raised. 2155 - gap - Single character string to denote symbol used for gaps. 2156 Defaults to None. 2157 2158 A simple string example using the default (standard) genetic code: 2159 2160 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2161 >>> translate(coding_dna) 2162 'VAIVMGR*KGAR*' 2163 >>> translate(coding_dna, stop_symbol="@") 2164 'VAIVMGR@KGAR@' 2165 >>> translate(coding_dna, to_stop=True) 2166 'VAIVMGR' 2167 2168 Now using NCBI table 2, where TGA is not a stop codon: 2169 2170 >>> translate(coding_dna, table=2) 2171 'VAIVMGRWKGAR*' 2172 >>> translate(coding_dna, table=2, to_stop=True) 2173 'VAIVMGRWKGAR' 2174 2175 In fact this example uses an alternative start codon valid under NCBI 2176 table 2, GTG, which means this example is a complete valid CDS which 2177 when translated should really start with methionine (not valine): 2178 2179 >>> translate(coding_dna, table=2, cds=True) 2180 'MAIVMGRWKGAR' 2181 2182 Note that if the sequence has no in-frame stop codon, then the to_stop 2183 argument has no effect: 2184 2185 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2186 >>> translate(coding_dna2) 2187 'VAIVMGR' 2188 >>> translate(coding_dna2, to_stop=True) 2189 'VAIVMGR' 2190 2191 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2192 or a stop codon. These are translated as "X". Any invalid codon 2193 (e.g. "TA?" or "T-A") will throw a TranslationError. 2194 2195 It will however translate either DNA or RNA. 2196 """ 2197 if isinstance(sequence, Seq): 2198 return sequence.translate(table, stop_symbol, to_stop, cds) 2199 elif isinstance(sequence, MutableSeq): 2200 # Return a Seq object 2201 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2202 else: 2203 # Assume its a string, return a string 2204 try: 2205 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2206 except ValueError: 2207 codon_table = CodonTable.ambiguous_generic_by_name[table] 2208 except (AttributeError, TypeError): 2209 if isinstance(table, CodonTable.CodonTable): 2210 codon_table = table 2211 else: 2212 raise ValueError('Bad table argument') 2213 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds, 2214 gap=gap)
2215 2216
2217 -def reverse_complement(sequence):
2218 """Returns the reverse complement sequence of a nucleotide string. 2219 2220 If given a string, returns a new string object. 2221 Given a Seq or a MutableSeq, returns a new Seq object with the same 2222 alphabet. 2223 2224 Supports unambiguous and ambiguous nucleotide sequences. 2225 2226 e.g. 2227 2228 >>> reverse_complement("ACTG-NH") 2229 'DN-CAGT' 2230 """ 2231 return complement(sequence)[::-1]
2232 2233
2234 -def complement(sequence):
2235 """Returns the complement sequence of a nucleotide string. 2236 2237 If given a string, returns a new string object. 2238 Given a Seq or a MutableSeq, returns a new Seq object with the same 2239 alphabet. 2240 2241 Supports unambiguous and ambiguous nucleotide sequences. 2242 2243 e.g. 2244 2245 >>> complement("ACTG-NH") 2246 'TGAC-ND' 2247 """ 2248 if isinstance(sequence, Seq): 2249 # Return a Seq 2250 return sequence.complement() 2251 elif isinstance(sequence, MutableSeq): 2252 # Return a Seq 2253 # Don't use the MutableSeq reverse_complement method as it is 2254 # 'in place'. 2255 return sequence.toseq().complement() 2256 2257 # Assume its a string. 2258 # In order to avoid some code duplication, the old code would turn the 2259 # string into a Seq, use the reverse_complement method, and convert back 2260 # to a string. 2261 # This worked, but is over five times slower on short sequences! 2262 if ('U' in sequence or 'u' in sequence) \ 2263 and ('T' in sequence or 't' in sequence): 2264 raise ValueError("Mixed RNA/DNA found") 2265 elif 'U' in sequence or 'u' in sequence: 2266 ttable = _rna_complement_table 2267 else: 2268 ttable = _dna_complement_table 2269 return sequence.translate(ttable)
2270 2271
2272 -def _test():
2273 """Run the Bio.Seq module's doctests (PRIVATE).""" 2274 print("Running doctests...") 2275 import doctest 2276 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2277 print("Done")
2278 2279 if __name__ == "__main__": 2280 _test() 2281