Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2015 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also the Seq_ wiki and the chapter in our tutorial: 
  11      - `HTML Tutorial`_ 
  12      - `PDF Tutorial`_ 
  13   
  14  .. _Seq: http://biopython.org/wiki/Seq 
  15  .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html 
  16  .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf 
  17  """ 
  18  from __future__ import print_function 
  19   
  20  import string  # for maketrans only 
  21  import array 
  22  import sys 
  23  import warnings 
  24   
  25  from Bio._py3k import range 
  26  from Bio._py3k import basestring 
  27   
  28  from Bio import BiopythonWarning 
  29  from Bio import Alphabet 
  30  from Bio.Alphabet import IUPAC 
  31  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  32  from Bio.Data import CodonTable 
  33   
  34   
35 -def _maketrans(complement_mapping):
36 """Makes a python string translation table (PRIVATE). 37 38 Arguments: 39 - complement_mapping - a dictionary such as ambiguous_dna_complement 40 and ambiguous_rna_complement from Data.IUPACData. 41 42 Returns a translation table (a string of length 256) for use with the 43 python string's translate method to use in a (reverse) complement. 44 45 Compatible with lower case and upper case sequences. 46 47 For internal use only. 48 """ 49 before = ''.join(complement_mapping.keys()) 50 after = ''.join(complement_mapping.values()) 51 before += before.lower() 52 after += after.lower() 53 if sys.version_info[0] == 3: 54 return str.maketrans(before, after) 55 else: 56 return string.maketrans(before, after)
57 58 _dna_complement_table = _maketrans(ambiguous_dna_complement) 59 _rna_complement_table = _maketrans(ambiguous_rna_complement) 60 61
62 -class Seq(object):
63 """A read-only sequence object (essentially a string with an alphabet). 64 65 Like normal python strings, our basic sequence object is immutable. 66 This prevents you from doing my_seq[5] = "A" for example, but does allow 67 Seq objects to be used as dictionary keys. 68 69 The Seq object provides a number of string like methods (such as count, 70 find, split and strip), which are alphabet aware where appropriate. 71 72 In addition to the string like sequence, the Seq object has an alphabet 73 property. This is an instance of an Alphabet class from Bio.Alphabet, 74 for example generic DNA, or IUPAC DNA. This describes the type of molecule 75 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 76 (letters). 77 78 The Seq object also provides some biological methods, such as complement, 79 reverse_complement, transcribe, back_transcribe and translate (which are 80 not applicable to sequences with a protein alphabet). 81 """
82 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
83 """Create a Seq object. 84 85 Arguments: 86 - seq - Sequence, required (string) 87 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 88 89 You will typically use Bio.SeqIO to read in sequences from files as 90 SeqRecord objects, whose sequence will be exposed as a Seq object via 91 the seq property. 92 93 However, will often want to create your own Seq objects directly: 94 95 >>> from Bio.Seq import Seq 96 >>> from Bio.Alphabet import IUPAC 97 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 98 ... IUPAC.protein) 99 >>> my_seq 100 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 101 >>> print(my_seq) 102 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 103 >>> my_seq.alphabet 104 IUPACProtein() 105 """ 106 # Enforce string storage 107 if not isinstance(data, basestring): 108 raise TypeError("The sequence data given to a Seq object should " 109 "be a string (not another Seq object etc)") 110 self._data = data 111 self.alphabet = alphabet # Seq API requirement
112
113 - def __repr__(self):
114 """Returns a (truncated) representation of the sequence for debugging.""" 115 if len(self) > 60: 116 # Shows the last three letters as it is often useful to see if there 117 # is a stop codon at the end of a sequence. 118 # Note total length is 54+3+3=60 119 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 120 str(self)[:54], 121 str(self)[-3:], 122 self.alphabet) 123 else: 124 return '{0}({1!r}, {2!r})'.format(self.__class__.__name__, 125 self._data, 126 self.alphabet)
127
128 - def __str__(self):
129 """Returns the full sequence as a python string, use str(my_seq). 130 131 Note that Biopython 1.44 and earlier would give a truncated 132 version of repr(my_seq) for str(my_seq). If you are writing code 133 which need to be backwards compatible with old Biopython, you 134 should continue to use my_seq.tostring() rather than str(my_seq). 135 """ 136 return self._data
137
138 - def __hash__(self):
139 """Hash for comparison. 140 141 See the __cmp__ documentation - this has changed from past 142 versions of Biopython! 143 """ 144 # TODO - remove this warning in a future release 145 warnings.warn("Biopython Seq objects now use string comparison. " 146 "Older versions of Biopython used object comparison. " 147 "During this transition, please use hash(id(my_seq)) " 148 "or my_dict[id(my_seq)] if you want the old behaviour, " 149 "or use hash(str(my_seq)) or my_dict[str(my_seq)] for " 150 "the new string hashing behaviour.", BiopythonWarning) 151 return hash(str(self))
152
153 - def __eq__(self, other):
154 """Compare the sequence to another sequence or a string (README). 155 156 Historically comparing Seq objects has done Python object comparison. 157 After considerable discussion (keeping in mind constraints of the 158 Python language, hashes and dictionary support), Biopython now uses 159 simple string comparison (with a warning about the change). 160 161 Note that incompatible alphabets (e.g. DNA to RNA) will trigger a 162 warning. 163 164 During this transition period, please just do explicit comparisons: 165 166 >>> from Bio.Seq import Seq 167 >>> from Bio.Alphabet import generic_dna 168 >>> seq1 = Seq("ACGT") 169 >>> seq2 = Seq("ACGT") 170 >>> id(seq1) == id(seq2) 171 False 172 >>> str(seq1) == str(seq2) 173 True 174 175 The new behaviour is to use string-like equality: 176 177 >>> from Bio.Seq import Seq 178 >>> from Bio.Alphabet import generic_dna 179 >>> seq1 == seq2 180 True 181 >>> seq1 == "ACGT" 182 True 183 >>> seq1 == Seq("ACGT", generic_dna) 184 True 185 186 """ 187 if hasattr(other, "alphabet"): 188 # other could be a Seq or a MutableSeq 189 if not Alphabet._check_type_compatible([self.alphabet, 190 other.alphabet]): 191 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 192 self.alphabet, other.alphabet), 193 BiopythonWarning) 194 return str(self) == str(other)
195
196 - def __ne__(self, other):
197 """Not equal, see __eq__ documentation.""" 198 # Seem to require this method under Python 2 but not needed on Python 3? 199 return not (self == other)
200
201 - def __lt__(self, other):
202 """Less than, see __eq__ documentation.""" 203 if hasattr(other, "alphabet"): 204 if not Alphabet._check_type_compatible([self.alphabet, 205 other.alphabet]): 206 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 207 self.alphabet, other.alphabet), 208 BiopythonWarning) 209 return str(self) < str(other)
210
211 - def __le__(self, other):
212 """Less than or equal, see __eq__ documentation.""" 213 if hasattr(other, "alphabet"): 214 if not Alphabet._check_type_compatible([self.alphabet, 215 other.alphabet]): 216 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 217 self.alphabet, other.alphabet), 218 BiopythonWarning) 219 return str(self) <= str(other)
220
221 - def __len__(self):
222 """Returns the length of the sequence, use len(my_seq).""" 223 return len(self._data) # Seq API requirement
224
225 - def __getitem__(self, index): # Seq API requirement
226 """Returns a subsequence of single letter, use my_seq[index].""" 227 # Note since Python 2.0, __getslice__ is deprecated 228 # and __getitem__ is used instead. 229 # See http://docs.python.org/ref/sequence-methods.html 230 if isinstance(index, int): 231 # Return a single letter as a string 232 return self._data[index] 233 else: 234 # Return the (sub)sequence as another Seq object 235 return Seq(self._data[index], self.alphabet)
236
237 - def __add__(self, other):
238 """Add another sequence or string to this sequence. 239 240 If adding a string to a Seq, the alphabet is preserved: 241 242 >>> from Bio.Seq import Seq 243 >>> from Bio.Alphabet import generic_protein 244 >>> Seq("MELKI", generic_protein) + "LV" 245 Seq('MELKILV', ProteinAlphabet()) 246 247 When adding two Seq (like) objects, the alphabets are important. 248 Consider this example: 249 250 >>> from Bio.Seq import Seq 251 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 252 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 253 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 254 >>> unamb_dna_seq 255 Seq('ACGT', IUPACUnambiguousDNA()) 256 >>> ambig_dna_seq 257 Seq('ACRGT', IUPACAmbiguousDNA()) 258 259 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 260 the more general ambiguous IUPAC DNA alphabet: 261 262 >>> unamb_dna_seq + ambig_dna_seq 263 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 264 265 However, if the default generic alphabet is included, the result is 266 a generic alphabet: 267 268 >>> Seq("") + ambig_dna_seq 269 Seq('ACRGT', Alphabet()) 270 271 You can't add RNA and DNA sequences: 272 273 >>> from Bio.Alphabet import generic_dna, generic_rna 274 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 275 Traceback (most recent call last): 276 ... 277 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 278 279 You can't add nucleotide and protein sequences: 280 281 >>> from Bio.Alphabet import generic_dna, generic_protein 282 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 283 Traceback (most recent call last): 284 ... 285 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 286 """ 287 if hasattr(other, "alphabet"): 288 # other should be a Seq or a MutableSeq 289 if not Alphabet._check_type_compatible([self.alphabet, 290 other.alphabet]): 291 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 292 self.alphabet, other.alphabet)) 293 # They should be the same sequence type (or one of them is generic) 294 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 295 return self.__class__(str(self) + str(other), a) 296 elif isinstance(other, basestring): 297 # other is a plain string - use the current alphabet 298 return self.__class__(str(self) + other, self.alphabet) 299 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 300 if isinstance(other, SeqRecord): 301 # Get the SeqRecord's __radd__ to handle this 302 return NotImplemented 303 else: 304 raise TypeError
305
306 - def __radd__(self, other):
307 """Adding a sequence on the left. 308 309 If adding a string to a Seq, the alphabet is preserved: 310 311 >>> from Bio.Seq import Seq 312 >>> from Bio.Alphabet import generic_protein 313 >>> "LV" + Seq("MELKI", generic_protein) 314 Seq('LVMELKI', ProteinAlphabet()) 315 316 Adding two Seq (like) objects is handled via the __add__ method. 317 """ 318 if hasattr(other, "alphabet"): 319 # other should be a Seq or a MutableSeq 320 if not Alphabet._check_type_compatible([self.alphabet, 321 other.alphabet]): 322 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 323 self.alphabet, other.alphabet)) 324 # They should be the same sequence type (or one of them is generic) 325 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 326 return self.__class__(str(other) + str(self), a) 327 elif isinstance(other, basestring): 328 # other is a plain string - use the current alphabet 329 return self.__class__(other + str(self), self.alphabet) 330 else: 331 raise TypeError
332
333 - def tostring(self): # Seq API requirement
334 """Returns the full sequence as a python string (DEPRECATED). 335 336 You are now encouraged to use str(my_seq) instead of 337 my_seq.tostring().""" 338 from Bio import BiopythonDeprecationWarning 339 warnings.warn("This method is obsolete; please use str(my_seq) " 340 "instead of my_seq.tostring().", 341 BiopythonDeprecationWarning) 342 return str(self) 343
344 - def tomutable(self): # Needed? Or use a function?
345 """Returns the full sequence as a MutableSeq object. 346 347 >>> from Bio.Seq import Seq 348 >>> from Bio.Alphabet import IUPAC 349 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 350 ... IUPAC.protein) 351 >>> my_seq 352 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 353 >>> my_seq.tomutable() 354 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 355 356 Note that the alphabet is preserved. 357 """ 358 return MutableSeq(str(self), self.alphabet) 359
360 - def _get_seq_str_and_check_alphabet(self, other_sequence):
361 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 362 363 For a string argument, returns the string. 364 365 For a Seq or MutableSeq, it checks the alphabet is compatible 366 (raising an exception if it isn't), and then returns a string. 367 """ 368 try: 369 other_alpha = other_sequence.alphabet 370 except AttributeError: 371 # Assume other_sequence is a string 372 return other_sequence 373 374 # Other should be a Seq or a MutableSeq 375 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 376 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 377 self.alphabet, other_alpha)) 378 # Return as a string 379 return str(other_sequence)
380
381 - def count(self, sub, start=0, end=sys.maxsize):
382 """Non-overlapping count method, like that of a python string. 383 384 This behaves like the python string method of the same name, 385 which does a non-overlapping count! 386 387 Returns an integer, the number of occurrences of substring 388 argument sub in the (sub)sequence given by [start:end]. 389 Optional arguments start and end are interpreted as in slice 390 notation. 391 392 Arguments: 393 - sub - a string or another Seq object to look for 394 - start - optional integer, slice start 395 - end - optional integer, slice end 396 397 e.g. 398 399 >>> from Bio.Seq import Seq 400 >>> my_seq = Seq("AAAATGA") 401 >>> print(my_seq.count("A")) 402 5 403 >>> print(my_seq.count("ATG")) 404 1 405 >>> print(my_seq.count(Seq("AT"))) 406 1 407 >>> print(my_seq.count("AT", 2, -1)) 408 1 409 410 HOWEVER, please note because python strings and Seq objects (and 411 MutableSeq objects) do a non-overlapping search, this may not give 412 the answer you expect: 413 414 >>> "AAAA".count("AA") 415 2 416 >>> print(Seq("AAAA").count("AA")) 417 2 418 419 An overlapping search would give the answer as three! 420 """ 421 # If it has one, check the alphabet: 422 sub_str = self._get_seq_str_and_check_alphabet(sub) 423 return str(self).count(sub_str, start, end)
424
425 - def __contains__(self, char):
426 """Implements the 'in' keyword, like a python string. 427 428 e.g. 429 430 >>> from Bio.Seq import Seq 431 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 432 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 433 >>> "AAA" in my_dna 434 True 435 >>> Seq("AAA") in my_dna 436 True 437 >>> Seq("AAA", generic_dna) in my_dna 438 True 439 440 Like other Seq methods, this will raise a type error if another Seq 441 (or Seq like) object with an incompatible alphabet is used: 442 443 >>> Seq("AAA", generic_rna) in my_dna 444 Traceback (most recent call last): 445 ... 446 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 447 >>> Seq("AAA", generic_protein) in my_dna 448 Traceback (most recent call last): 449 ... 450 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 451 """ 452 # If it has one, check the alphabet: 453 sub_str = self._get_seq_str_and_check_alphabet(char) 454 return sub_str in str(self)
455
456 - def find(self, sub, start=0, end=sys.maxsize):
457 """Find method, like that of a python string. 458 459 This behaves like the python string method of the same name. 460 461 Returns an integer, the index of the first occurrence of substring 462 argument sub in the (sub)sequence given by [start:end]. 463 464 Arguments: 465 - sub - a string or another Seq object to look for 466 - start - optional integer, slice start 467 - end - optional integer, slice end 468 469 Returns -1 if the subsequence is NOT found. 470 471 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 472 473 >>> from Bio.Seq import Seq 474 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 475 >>> my_rna.find("AUG") 476 3 477 """ 478 # If it has one, check the alphabet: 479 sub_str = self._get_seq_str_and_check_alphabet(sub) 480 return str(self).find(sub_str, start, end)
481
482 - def rfind(self, sub, start=0, end=sys.maxsize):
483 """Find from right method, like that of a python string. 484 485 This behaves like the python string method of the same name. 486 487 Returns an integer, the index of the last (right most) occurrence of 488 substring argument sub in the (sub)sequence given by [start:end]. 489 490 Arguments: 491 - sub - a string or another Seq object to look for 492 - start - optional integer, slice start 493 - end - optional integer, slice end 494 495 Returns -1 if the subsequence is NOT found. 496 497 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 498 499 >>> from Bio.Seq import Seq 500 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 501 >>> my_rna.rfind("AUG") 502 15 503 """ 504 # If it has one, check the alphabet: 505 sub_str = self._get_seq_str_and_check_alphabet(sub) 506 return str(self).rfind(sub_str, start, end)
507
508 - def startswith(self, prefix, start=0, end=sys.maxsize):
509 """Does the Seq start with the given prefix? Returns True/False. 510 511 This behaves like the python string method of the same name. 512 513 Return True if the sequence starts with the specified prefix 514 (a string or another Seq object), False otherwise. 515 With optional start, test sequence beginning at that position. 516 With optional end, stop comparing sequence at that position. 517 prefix can also be a tuple of strings to try. e.g. 518 519 >>> from Bio.Seq import Seq 520 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 521 >>> my_rna.startswith("GUC") 522 True 523 >>> my_rna.startswith("AUG") 524 False 525 >>> my_rna.startswith("AUG", 3) 526 True 527 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 528 True 529 """ 530 # If it has one, check the alphabet: 531 if isinstance(prefix, tuple): 532 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 533 for p in prefix) 534 return str(self).startswith(prefix_strs, start, end) 535 else: 536 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 537 return str(self).startswith(prefix_str, start, end)
538
539 - def endswith(self, suffix, start=0, end=sys.maxsize):
540 """Does the Seq end with the given suffix? Returns True/False. 541 542 This behaves like the python string method of the same name. 543 544 Return True if the sequence ends with the specified suffix 545 (a string or another Seq object), False otherwise. 546 With optional start, test sequence beginning at that position. 547 With optional end, stop comparing sequence at that position. 548 suffix can also be a tuple of strings to try. e.g. 549 550 >>> from Bio.Seq import Seq 551 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 552 >>> my_rna.endswith("UUG") 553 True 554 >>> my_rna.endswith("AUG") 555 False 556 >>> my_rna.endswith("AUG", 0, 18) 557 True 558 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 559 True 560 """ 561 # If it has one, check the alphabet: 562 if isinstance(suffix, tuple): 563 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 564 for p in suffix) 565 return str(self).endswith(suffix_strs, start, end) 566 else: 567 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 568 return str(self).endswith(suffix_str, start, end)
569
570 - def split(self, sep=None, maxsplit=-1):
571 """Split method, like that of a python string. 572 573 This behaves like the python string method of the same name. 574 575 Return a list of the 'words' in the string (as Seq objects), 576 using sep as the delimiter string. If maxsplit is given, at 577 most maxsplit splits are done. If maxsplit is omitted, all 578 splits are made. 579 580 Following the python string method, sep will by default be any 581 white space (tabs, spaces, newlines) but this is unlikely to 582 apply to biological sequences. 583 584 e.g. 585 586 >>> from Bio.Seq import Seq 587 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 588 >>> my_aa = my_rna.translate() 589 >>> my_aa 590 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 591 >>> my_aa.split("*") 592 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 593 >>> my_aa.split("*", 1) 594 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 595 596 See also the rsplit method: 597 598 >>> my_aa.rsplit("*", 1) 599 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 600 """ 601 # If it has one, check the alphabet: 602 sep_str = self._get_seq_str_and_check_alphabet(sep) 603 # TODO - If the sep is the defined stop symbol, or gap char, 604 # should we adjust the alphabet? 605 return [Seq(part, self.alphabet) 606 for part in str(self).split(sep_str, maxsplit)]
607
608 - def rsplit(self, sep=None, maxsplit=-1):
609 """Right split method, like that of a python string. 610 611 This behaves like the python string method of the same name. 612 613 Return a list of the 'words' in the string (as Seq objects), 614 using sep as the delimiter string. If maxsplit is given, at 615 most maxsplit splits are done COUNTING FROM THE RIGHT. 616 If maxsplit is omitted, all splits are made. 617 618 Following the python string method, sep will by default be any 619 white space (tabs, spaces, newlines) but this is unlikely to 620 apply to biological sequences. 621 622 e.g. print(my_seq.rsplit("*",1)) 623 624 See also the split method. 625 """ 626 # If it has one, check the alphabet: 627 sep_str = self._get_seq_str_and_check_alphabet(sep) 628 return [Seq(part, self.alphabet) 629 for part in str(self).rsplit(sep_str, maxsplit)]
630
631 - def strip(self, chars=None):
632 """Returns a new Seq object with leading and trailing ends stripped. 633 634 This behaves like the python string method of the same name. 635 636 Optional argument chars defines which characters to remove. If 637 omitted or None (default) then as for the python string method, 638 this defaults to removing any white space. 639 640 e.g. print(my_seq.strip("-")) 641 642 See also the lstrip and rstrip methods. 643 """ 644 # If it has one, check the alphabet: 645 strip_str = self._get_seq_str_and_check_alphabet(chars) 646 return Seq(str(self).strip(strip_str), self.alphabet)
647
648 - def lstrip(self, chars=None):
649 """Returns a new Seq object with leading (left) end stripped. 650 651 This behaves like the python string method of the same name. 652 653 Optional argument chars defines which characters to remove. If 654 omitted or None (default) then as for the python string method, 655 this defaults to removing any white space. 656 657 e.g. print(my_seq.lstrip("-")) 658 659 See also the strip and rstrip methods. 660 """ 661 # If it has one, check the alphabet: 662 strip_str = self._get_seq_str_and_check_alphabet(chars) 663 return Seq(str(self).lstrip(strip_str), self.alphabet)
664
665 - def rstrip(self, chars=None):
666 """Returns a new Seq object with trailing (right) end stripped. 667 668 This behaves like the python string method of the same name. 669 670 Optional argument chars defines which characters to remove. If 671 omitted or None (default) then as for the python string method, 672 this defaults to removing any white space. 673 674 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 675 676 >>> from Bio.Alphabet import IUPAC 677 >>> from Bio.Seq import Seq 678 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 679 >>> my_seq 680 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 681 >>> my_seq.rstrip("A") 682 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 683 684 See also the strip and lstrip methods. 685 """ 686 # If it has one, check the alphabet: 687 strip_str = self._get_seq_str_and_check_alphabet(chars) 688 return Seq(str(self).rstrip(strip_str), self.alphabet)
689
690 - def upper(self):
691 """Returns an upper case copy of the sequence. 692 693 >>> from Bio.Alphabet import HasStopCodon, generic_protein 694 >>> from Bio.Seq import Seq 695 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 696 >>> my_seq 697 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 698 >>> my_seq.lower() 699 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 700 >>> my_seq.upper() 701 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 702 703 This will adjust the alphabet if required. See also the lower method. 704 """ 705 return Seq(str(self).upper(), self.alphabet._upper())
706
707 - def lower(self):
708 """Returns a lower case copy of the sequence. 709 710 This will adjust the alphabet if required. Note that the IUPAC alphabets 711 are upper case only, and thus a generic alphabet must be substituted. 712 713 >>> from Bio.Alphabet import Gapped, generic_dna 714 >>> from Bio.Alphabet import IUPAC 715 >>> from Bio.Seq import Seq 716 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 717 >>> my_seq 718 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 719 >>> my_seq.lower() 720 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 721 722 See also the upper method. 723 """ 724 return Seq(str(self).lower(), self.alphabet._lower())
725
726 - def complement(self):
727 """Returns the complement sequence. New Seq object. 728 729 >>> from Bio.Seq import Seq 730 >>> from Bio.Alphabet import IUPAC 731 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 732 >>> my_dna 733 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 734 >>> my_dna.complement() 735 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 736 737 You can of course used mixed case sequences, 738 739 >>> from Bio.Seq import Seq 740 >>> from Bio.Alphabet import generic_dna 741 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 742 >>> my_dna 743 Seq('CCCCCgatA-GD', DNAAlphabet()) 744 >>> my_dna.complement() 745 Seq('GGGGGctaT-CH', DNAAlphabet()) 746 747 Note in the above example, ambiguous character D denotes 748 G, A or T so its complement is H (for C, T or A). 749 750 Trying to complement a protein sequence raises an exception. 751 752 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 753 >>> my_protein.complement() 754 Traceback (most recent call last): 755 ... 756 ValueError: Proteins do not have complements! 757 """ 758 base = Alphabet._get_base_alphabet(self.alphabet) 759 if isinstance(base, Alphabet.ProteinAlphabet): 760 raise ValueError("Proteins do not have complements!") 761 if isinstance(base, Alphabet.DNAAlphabet): 762 ttable = _dna_complement_table 763 elif isinstance(base, Alphabet.RNAAlphabet): 764 ttable = _rna_complement_table 765 elif ('U' in self._data or 'u' in self._data) \ 766 and ('T' in self._data or 't' in self._data): 767 # TODO - Handle this cleanly? 768 raise ValueError("Mixed RNA/DNA found") 769 elif 'U' in self._data or 'u' in self._data: 770 ttable = _rna_complement_table 771 else: 772 ttable = _dna_complement_table 773 # Much faster on really long sequences than the previous loop based one. 774 # thx to Michael Palmer, University of Waterloo 775 return Seq(str(self).translate(ttable), self.alphabet)
776
777 - def reverse_complement(self):
778 """Returns the reverse complement sequence. New Seq object. 779 780 >>> from Bio.Seq import Seq 781 >>> from Bio.Alphabet import IUPAC 782 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 783 >>> my_dna 784 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 785 >>> my_dna.reverse_complement() 786 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 787 788 Note in the above example, since R = G or A, its complement 789 is Y (which denotes C or T). 790 791 You can of course used mixed case sequences, 792 793 >>> from Bio.Seq import Seq 794 >>> from Bio.Alphabet import generic_dna 795 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 796 >>> my_dna 797 Seq('CCCCCgatA-G', DNAAlphabet()) 798 >>> my_dna.reverse_complement() 799 Seq('C-TatcGGGGG', DNAAlphabet()) 800 801 Trying to complement a protein sequence raises an exception: 802 803 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 804 >>> my_protein.reverse_complement() 805 Traceback (most recent call last): 806 ... 807 ValueError: Proteins do not have complements! 808 """ 809 # Use -1 stride/step to reverse the complement 810 return self.complement()[::-1]
811
812 - def transcribe(self):
813 """Returns the RNA sequence from a DNA sequence. New Seq object. 814 815 >>> from Bio.Seq import Seq 816 >>> from Bio.Alphabet import IUPAC 817 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 818 ... IUPAC.unambiguous_dna) 819 >>> coding_dna 820 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 821 >>> coding_dna.transcribe() 822 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 823 824 Trying to transcribe a protein or RNA sequence raises an exception: 825 826 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 827 >>> my_protein.transcribe() 828 Traceback (most recent call last): 829 ... 830 ValueError: Proteins cannot be transcribed! 831 """ 832 base = Alphabet._get_base_alphabet(self.alphabet) 833 if isinstance(base, Alphabet.ProteinAlphabet): 834 raise ValueError("Proteins cannot be transcribed!") 835 if isinstance(base, Alphabet.RNAAlphabet): 836 raise ValueError("RNA cannot be transcribed!") 837 838 if self.alphabet == IUPAC.unambiguous_dna: 839 alphabet = IUPAC.unambiguous_rna 840 elif self.alphabet == IUPAC.ambiguous_dna: 841 alphabet = IUPAC.ambiguous_rna 842 else: 843 alphabet = Alphabet.generic_rna 844 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
845
846 - def back_transcribe(self):
847 """Returns the DNA sequence from an RNA sequence. New Seq object. 848 849 >>> from Bio.Seq import Seq 850 >>> from Bio.Alphabet import IUPAC 851 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 852 ... IUPAC.unambiguous_rna) 853 >>> messenger_rna 854 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 855 >>> messenger_rna.back_transcribe() 856 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 857 858 Trying to back-transcribe a protein or DNA sequence raises an 859 exception: 860 861 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 862 >>> my_protein.back_transcribe() 863 Traceback (most recent call last): 864 ... 865 ValueError: Proteins cannot be back transcribed! 866 """ 867 base = Alphabet._get_base_alphabet(self.alphabet) 868 if isinstance(base, Alphabet.ProteinAlphabet): 869 raise ValueError("Proteins cannot be back transcribed!") 870 if isinstance(base, Alphabet.DNAAlphabet): 871 raise ValueError("DNA cannot be back transcribed!") 872 873 if self.alphabet == IUPAC.unambiguous_rna: 874 alphabet = IUPAC.unambiguous_dna 875 elif self.alphabet == IUPAC.ambiguous_rna: 876 alphabet = IUPAC.ambiguous_dna 877 else: 878 alphabet = Alphabet.generic_dna 879 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
880
881 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 882 cds=False, gap=None):
883 """Turns a nucleotide sequence into a protein sequence. New Seq object. 884 885 This method will translate DNA or RNA sequences, and those with a 886 nucleotide or generic alphabet. Trying to translate a protein 887 sequence raises an exception. 888 889 Arguments: 890 - table - Which codon table to use? This can be either a name 891 (string), an NCBI identifier (integer), or a CodonTable 892 object (useful for non-standard genetic codes). This 893 defaults to the "Standard" table. 894 - stop_symbol - Single character string, what to use for terminators. 895 This defaults to the asterisk, "*". 896 - to_stop - Boolean, defaults to False meaning do a full translation 897 continuing on past any stop codons (translated as the 898 specified stop_symbol). If True, translation is 899 terminated at the first in frame stop codon (and the 900 stop_symbol is not appended to the returned protein 901 sequence). 902 - cds - Boolean, indicates this is a complete CDS. If True, 903 this checks the sequence starts with a valid alternative start 904 codon (which will be translated as methionine, M), that the 905 sequence length is a multiple of three, and that there is a 906 single in frame stop codon at the end (this will be excluded 907 from the protein sequence, regardless of the to_stop option). 908 If these tests fail, an exception is raised. 909 - gap - Single character string to denote symbol used for gaps. 910 It will try to guess the gap character from the alphabet. 911 912 e.g. Using the standard table: 913 914 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 915 >>> coding_dna.translate() 916 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 917 >>> coding_dna.translate(stop_symbol="@") 918 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 919 >>> coding_dna.translate(to_stop=True) 920 Seq('VAIVMGR', ExtendedIUPACProtein()) 921 922 Now using NCBI table 2, where TGA is not a stop codon: 923 924 >>> coding_dna.translate(table=2) 925 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 926 >>> coding_dna.translate(table=2, to_stop=True) 927 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 928 929 In fact, GTG is an alternative start codon under NCBI table 2, meaning 930 this sequence could be a complete CDS: 931 932 >>> coding_dna.translate(table=2, cds=True) 933 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 934 935 It isn't a valid CDS under NCBI table 1, due to both the start codon and 936 also the in frame stop codons: 937 938 >>> coding_dna.translate(table=1, cds=True) 939 Traceback (most recent call last): 940 ... 941 TranslationError: First codon 'GTG' is not a start codon 942 943 If the sequence has no in-frame stop codon, then the to_stop argument 944 has no effect: 945 946 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 947 >>> coding_dna2.translate() 948 Seq('LAIVMGR', ExtendedIUPACProtein()) 949 >>> coding_dna2.translate(to_stop=True) 950 Seq('LAIVMGR', ExtendedIUPACProtein()) 951 952 When translating gapped sequences, the gap character is inferred from 953 the alphabet: 954 955 >>> from Bio.Alphabet import Gapped 956 >>> coding_dna3 = Seq("GTG---GCCATT", Gapped(IUPAC.unambiguous_dna)) 957 >>> coding_dna3.translate() 958 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 959 960 It is possible to pass the gap character when the alphabet is missing: 961 962 >>> coding_dna4 = Seq("GTG---GCCATT") 963 >>> coding_dna4.translate(gap='-') 964 Seq('V-AI', Gapped(ExtendedIUPACProtein(), '-')) 965 966 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 967 or a stop codon. These are translated as "X". Any invalid codon 968 (e.g. "TA?" or "T-A") will throw a TranslationError. 969 970 NOTE - This does NOT behave like the python string's translate 971 method. For that use str(my_seq).translate(...) instead. 972 """ 973 if isinstance(table, str) and len(table) == 256: 974 raise ValueError("The Seq object translate method DOES NOT take " 975 "a 256 character string mapping table like " 976 "the python string object's translate method. " 977 "Use str(my_seq).translate(...) instead.") 978 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 979 Alphabet.ProteinAlphabet): 980 raise ValueError("Proteins cannot be translated!") 981 try: 982 table_id = int(table) 983 except ValueError: 984 # Assume its a table name 985 if self.alphabet == IUPAC.unambiguous_dna: 986 # Will use standard IUPAC protein alphabet, no need for X 987 codon_table = CodonTable.unambiguous_dna_by_name[table] 988 elif self.alphabet == IUPAC.unambiguous_rna: 989 # Will use standard IUPAC protein alphabet, no need for X 990 codon_table = CodonTable.unambiguous_rna_by_name[table] 991 else: 992 # This will use the extended IUPAC protein alphabet with X etc. 993 # The same table can be used for RNA or DNA (we use this for 994 # translating strings). 995 codon_table = CodonTable.ambiguous_generic_by_name[table] 996 except (AttributeError, TypeError): 997 # Assume its a CodonTable object 998 if isinstance(table, CodonTable.CodonTable): 999 codon_table = table 1000 else: 1001 raise ValueError('Bad table argument') 1002 else: 1003 # Assume its a table ID 1004 if self.alphabet == IUPAC.unambiguous_dna: 1005 # Will use standard IUPAC protein alphabet, no need for X 1006 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 1007 elif self.alphabet == IUPAC.unambiguous_rna: 1008 # Will use standard IUPAC protein alphabet, no need for X 1009 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 1010 else: 1011 # This will use the extended IUPAC protein alphabet with X etc. 1012 # The same table can be used for RNA or DNA (we use this for 1013 # translating strings). 1014 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 1015 1016 # Deal with gaps for translation 1017 if hasattr(self.alphabet, "gap_char"): 1018 if not gap: 1019 gap = self.alphabet.gap_char 1020 elif gap != self.alphabet.gap_char: 1021 raise ValueError("Gap {0!r} does not match {1!r} from alphabet".format( 1022 gap, self.alphabet.gap_char)) 1023 1024 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop, 1025 cds, gap=gap) 1026 1027 if gap and gap in protein: 1028 alphabet = Alphabet.Gapped(codon_table.protein_alphabet, gap) 1029 else: 1030 alphabet = codon_table.protein_alphabet 1031 1032 if stop_symbol in protein: 1033 alphabet = Alphabet.HasStopCodon(alphabet, stop_symbol) 1034 1035 return Seq(protein, alphabet)
1036
1037 - def ungap(self, gap=None):
1038 """Return a copy of the sequence without the gap character(s). 1039 1040 The gap character can be specified in two ways - either as an explicit 1041 argument, or via the sequence's alphabet. For example: 1042 1043 >>> from Bio.Seq import Seq 1044 >>> from Bio.Alphabet import generic_dna 1045 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 1046 >>> my_dna 1047 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1048 >>> my_dna.ungap("-") 1049 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 1050 1051 If the gap character is not given as an argument, it will be taken from 1052 the sequence's alphabet (if defined). Notice that the returned sequence's 1053 alphabet is adjusted since it no longer requires a gapped alphabet: 1054 1055 >>> from Bio.Seq import Seq 1056 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 1057 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 1058 >>> my_pro 1059 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 1060 >>> my_pro.ungap() 1061 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 1062 1063 Or, with a simpler gapped DNA example: 1064 1065 >>> from Bio.Seq import Seq 1066 >>> from Bio.Alphabet import IUPAC, Gapped 1067 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1068 >>> my_seq 1069 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1070 >>> my_seq.ungap() 1071 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1072 1073 As long as it is consistent with the alphabet, although it is redundant, 1074 you can still supply the gap character as an argument to this method: 1075 1076 >>> my_seq 1077 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1078 >>> my_seq.ungap("=") 1079 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1080 1081 However, if the gap character given as the argument disagrees with that 1082 declared in the alphabet, an exception is raised: 1083 1084 >>> my_seq 1085 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1086 >>> my_seq.ungap("-") 1087 Traceback (most recent call last): 1088 ... 1089 ValueError: Gap '-' does not match '=' from alphabet 1090 1091 Finally, if a gap character is not supplied, and the alphabet does not 1092 define one, an exception is raised: 1093 1094 >>> from Bio.Seq import Seq 1095 >>> from Bio.Alphabet import generic_dna 1096 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1097 >>> my_dna 1098 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1099 >>> my_dna.ungap() 1100 Traceback (most recent call last): 1101 ... 1102 ValueError: Gap character not given and not defined in alphabet 1103 1104 """ 1105 if hasattr(self.alphabet, "gap_char"): 1106 if not gap: 1107 gap = self.alphabet.gap_char 1108 elif gap != self.alphabet.gap_char: 1109 raise ValueError("Gap {0!r} does not match {1!r} from alphabet".format( 1110 gap, self.alphabet.gap_char)) 1111 alpha = Alphabet._ungap(self.alphabet) 1112 elif not gap: 1113 raise ValueError("Gap character not given and not defined in alphabet") 1114 else: 1115 alpha = self.alphabet # modify! 1116 if len(gap) != 1 or not isinstance(gap, str): 1117 raise ValueError("Unexpected gap character, {0!r}".format(gap)) 1118 return Seq(str(self).replace(gap, ""), alpha)
1119 1120
1121 -class UnknownSeq(Seq):
1122 """A read-only sequence object of known length but unknown contents. 1123 1124 If you have an unknown sequence, you can represent this with a normal 1125 Seq object, for example: 1126 1127 >>> my_seq = Seq("N"*5) 1128 >>> my_seq 1129 Seq('NNNNN', Alphabet()) 1130 >>> len(my_seq) 1131 5 1132 >>> print(my_seq) 1133 NNNNN 1134 1135 However, this is rather wasteful of memory (especially for large 1136 sequences), which is where this class is most useful: 1137 1138 >>> unk_five = UnknownSeq(5) 1139 >>> unk_five 1140 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1141 >>> len(unk_five) 1142 5 1143 >>> print(unk_five) 1144 ????? 1145 1146 You can add unknown sequence together, provided their alphabets and 1147 characters are compatible, and get another memory saving UnknownSeq: 1148 1149 >>> unk_four = UnknownSeq(4) 1150 >>> unk_four 1151 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1152 >>> unk_four + unk_five 1153 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1154 1155 If the alphabet or characters don't match up, the addition gives an 1156 ordinary Seq object: 1157 1158 >>> unk_nnnn = UnknownSeq(4, character = "N") 1159 >>> unk_nnnn 1160 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1161 >>> unk_nnnn + unk_four 1162 Seq('NNNN????', Alphabet()) 1163 1164 Combining with a real Seq gives a new Seq object: 1165 1166 >>> known_seq = Seq("ACGT") 1167 >>> unk_four + known_seq 1168 Seq('????ACGT', Alphabet()) 1169 >>> known_seq + unk_four 1170 Seq('ACGT????', Alphabet()) 1171 """
1172 - def __init__(self, length, alphabet=Alphabet.generic_alphabet, character=None):
1173 """Create a new UnknownSeq object. 1174 1175 If character is omitted, it is determined from the alphabet, "N" for 1176 nucleotides, "X" for proteins, and "?" otherwise. 1177 """ 1178 self._length = int(length) 1179 if self._length < 0: 1180 # TODO - Block zero length UnknownSeq? You can just use a Seq! 1181 raise ValueError("Length must not be negative.") 1182 self.alphabet = alphabet 1183 if character: 1184 if len(character) != 1: 1185 raise ValueError("character argument should be a single letter string.") 1186 self._character = character 1187 else: 1188 base = Alphabet._get_base_alphabet(alphabet) 1189 # TODO? Check the case of the letters in the alphabet? 1190 # We may have to use "n" instead of "N" etc. 1191 if isinstance(base, Alphabet.NucleotideAlphabet): 1192 self._character = "N" 1193 elif isinstance(base, Alphabet.ProteinAlphabet): 1194 self._character = "X" 1195 else: 1196 self._character = "?"
1197
1198 - def __len__(self):
1199 """Returns the stated length of the unknown sequence.""" 1200 return self._length
1201
1202 - def __str__(self):
1203 """Returns the unknown sequence as full string of the given length.""" 1204 return self._character * self._length
1205
1206 - def __repr__(self):
1207 return "UnknownSeq({0}, alphabet = {1!r}, character = {2!r})".format( 1208 self._length, self.alphabet, self._character)
1209
1210 - def __add__(self, other):
1211 """Add another sequence or string to this sequence. 1212 1213 Adding two UnknownSeq objects returns another UnknownSeq object 1214 provided the character is the same and the alphabets are compatible. 1215 1216 >>> from Bio.Seq import UnknownSeq 1217 >>> from Bio.Alphabet import generic_protein 1218 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1219 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1220 1221 If the characters differ, an UnknownSeq object cannot be used, so a 1222 Seq object is returned: 1223 1224 >>> from Bio.Seq import UnknownSeq 1225 >>> from Bio.Alphabet import generic_protein 1226 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1227 ... character="x") 1228 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1229 1230 If adding a string to an UnknownSeq, a new Seq is returned with the 1231 same alphabet: 1232 1233 >>> from Bio.Seq import UnknownSeq 1234 >>> from Bio.Alphabet import generic_protein 1235 >>> UnknownSeq(5, generic_protein) + "LV" 1236 Seq('XXXXXLV', ProteinAlphabet()) 1237 """ 1238 if isinstance(other, UnknownSeq) and other._character == self._character: 1239 # TODO - Check the alphabets match 1240 return UnknownSeq(len(self) + len(other), 1241 self.alphabet, self._character) 1242 # Offload to the base class... 1243 return Seq(str(self), self.alphabet) + other
1244
1245 - def __radd__(self, other):
1246 # If other is an UnknownSeq, then __add__ would be called. 1247 # Offload to the base class... 1248 return other + Seq(str(self), self.alphabet)
1249
1250 - def __getitem__(self, index):
1251 """Get a subsequence from the UnknownSeq object. 1252 1253 >>> unk = UnknownSeq(8, character="N") 1254 >>> print(unk[:]) 1255 NNNNNNNN 1256 >>> print(unk[5:3]) 1257 <BLANKLINE> 1258 >>> print(unk[1:-1]) 1259 NNNNNN 1260 >>> print(unk[1:-1:2]) 1261 NNN 1262 """ 1263 if isinstance(index, int): 1264 # TODO - Check the bounds without wasting memory 1265 return str(self)[index] 1266 old_length = self._length 1267 step = index.step 1268 if step is None or step == 1: 1269 # This calculates the length you'd get from ("N"*old_length)[index] 1270 start = index.start 1271 end = index.stop 1272 if start is None: 1273 start = 0 1274 elif start < 0: 1275 start = max(0, old_length + start) 1276 elif start > old_length: 1277 start = old_length 1278 if end is None: 1279 end = old_length 1280 elif end < 0: 1281 end = max(0, old_length + end) 1282 elif end > old_length: 1283 end = old_length 1284 new_length = max(0, end - start) 1285 elif step == 0: 1286 raise ValueError("slice step cannot be zero") 1287 else: 1288 # TODO - handle step efficiently 1289 new_length = len(("X" * old_length)[index]) 1290 # assert new_length == len(("X"*old_length)[index]), \ 1291 # (index, start, end, step, old_length, 1292 # new_length, len(("X"*old_length)[index])) 1293 return UnknownSeq(new_length, self.alphabet, self._character)
1294
1295 - def count(self, sub, start=0, end=sys.maxsize):
1296 """Non-overlapping count method, like that of a python string. 1297 1298 This behaves like the python string (and Seq object) method of the 1299 same name, which does a non-overlapping count! 1300 1301 Returns an integer, the number of occurrences of substring 1302 argument sub in the (sub)sequence given by [start:end]. 1303 Optional arguments start and end are interpreted as in slice 1304 notation. 1305 1306 Arguments: 1307 - sub - a string or another Seq object to look for 1308 - start - optional integer, slice start 1309 - end - optional integer, slice end 1310 1311 >>> "NNNN".count("N") 1312 4 1313 >>> Seq("NNNN").count("N") 1314 4 1315 >>> UnknownSeq(4, character="N").count("N") 1316 4 1317 >>> UnknownSeq(4, character="N").count("A") 1318 0 1319 >>> UnknownSeq(4, character="N").count("AA") 1320 0 1321 1322 HOWEVER, please note because that python strings and Seq objects (and 1323 MutableSeq objects) do a non-overlapping search, this may not give 1324 the answer you expect: 1325 1326 >>> UnknownSeq(4, character="N").count("NN") 1327 2 1328 >>> UnknownSeq(4, character="N").count("NNN") 1329 1 1330 """ 1331 sub_str = self._get_seq_str_and_check_alphabet(sub) 1332 if len(sub_str) == 1: 1333 if str(sub_str) == self._character: 1334 if start == 0 and end >= self._length: 1335 return self._length 1336 else: 1337 # This could be done more cleverly... 1338 return str(self).count(sub_str, start, end) 1339 else: 1340 return 0 1341 else: 1342 if set(sub_str) == set(self._character): 1343 if start == 0 and end >= self._length: 1344 return self._length // len(sub_str) 1345 else: 1346 # This could be done more cleverly... 1347 return str(self).count(sub_str, start, end) 1348 else: 1349 return 0
1350
1351 - def complement(self):
1352 """The complement of an unknown nucleotide equals itself. 1353 1354 >>> my_nuc = UnknownSeq(8) 1355 >>> my_nuc 1356 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1357 >>> print(my_nuc) 1358 ???????? 1359 >>> my_nuc.complement() 1360 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1361 >>> print(my_nuc.complement()) 1362 ???????? 1363 """ 1364 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1365 Alphabet.ProteinAlphabet): 1366 raise ValueError("Proteins do not have complements!") 1367 return self
1368
1369 - def reverse_complement(self):
1370 """The reverse complement of an unknown nucleotide equals itself. 1371 1372 >>> my_nuc = UnknownSeq(10) 1373 >>> my_nuc 1374 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1375 >>> print(my_nuc) 1376 ?????????? 1377 >>> my_nuc.reverse_complement() 1378 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1379 >>> print(my_nuc.reverse_complement()) 1380 ?????????? 1381 """ 1382 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1383 Alphabet.ProteinAlphabet): 1384 raise ValueError("Proteins do not have complements!") 1385 return self
1386
1387 - def transcribe(self):
1388 """Returns unknown RNA sequence from an unknown DNA sequence. 1389 1390 >>> my_dna = UnknownSeq(10, character="N") 1391 >>> my_dna 1392 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1393 >>> print(my_dna) 1394 NNNNNNNNNN 1395 >>> my_rna = my_dna.transcribe() 1396 >>> my_rna 1397 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1398 >>> print(my_rna) 1399 NNNNNNNNNN 1400 """ 1401 # Offload the alphabet stuff 1402 s = Seq(self._character, self.alphabet).transcribe() 1403 return UnknownSeq(self._length, s.alphabet, self._character)
1404
1405 - def back_transcribe(self):
1406 """Returns unknown DNA sequence from an unknown RNA sequence. 1407 1408 >>> my_rna = UnknownSeq(20, character="N") 1409 >>> my_rna 1410 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1411 >>> print(my_rna) 1412 NNNNNNNNNNNNNNNNNNNN 1413 >>> my_dna = my_rna.back_transcribe() 1414 >>> my_dna 1415 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1416 >>> print(my_dna) 1417 NNNNNNNNNNNNNNNNNNNN 1418 """ 1419 # Offload the alphabet stuff 1420 s = Seq(self._character, self.alphabet).back_transcribe() 1421 return UnknownSeq(self._length, s.alphabet, self._character)
1422
1423 - def upper(self):
1424 """Returns an upper case copy of the sequence. 1425 1426 >>> from Bio.Alphabet import generic_dna 1427 >>> from Bio.Seq import UnknownSeq 1428 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1429 >>> my_seq 1430 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1431 >>> print(my_seq) 1432 nnnnnnnnnnnnnnnnnnnn 1433 >>> my_seq.upper() 1434 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1435 >>> print(my_seq.upper()) 1436 NNNNNNNNNNNNNNNNNNNN 1437 1438 This will adjust the alphabet if required. See also the lower method. 1439 """ 1440 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1441
1442 - def lower(self):
1443 """Returns a lower case copy of the sequence. 1444 1445 This will adjust the alphabet if required: 1446 1447 >>> from Bio.Alphabet import IUPAC 1448 >>> from Bio.Seq import UnknownSeq 1449 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1450 >>> my_seq 1451 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1452 >>> print(my_seq) 1453 XXXXXXXXXXXXXXXXXXXX 1454 >>> my_seq.lower() 1455 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1456 >>> print(my_seq.lower()) 1457 xxxxxxxxxxxxxxxxxxxx 1458 1459 See also the upper method. 1460 """ 1461 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1462
1463 - def translate(self, **kwargs):
1464 """Translate an unknown nucleotide sequence into an unknown protein. 1465 1466 e.g. 1467 1468 >>> my_seq = UnknownSeq(9, character="N") 1469 >>> print(my_seq) 1470 NNNNNNNNN 1471 >>> my_protein = my_seq.translate() 1472 >>> my_protein 1473 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1474 >>> print(my_protein) 1475 XXX 1476 1477 In comparison, using a normal Seq object: 1478 1479 >>> my_seq = Seq("NNNNNNNNN") 1480 >>> print(my_seq) 1481 NNNNNNNNN 1482 >>> my_protein = my_seq.translate() 1483 >>> my_protein 1484 Seq('XXX', ExtendedIUPACProtein()) 1485 >>> print(my_protein) 1486 XXX 1487 1488 """ 1489 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1490 Alphabet.ProteinAlphabet): 1491 raise ValueError("Proteins cannot be translated!") 1492 return UnknownSeq(self._length // 3, Alphabet.generic_protein, "X")
1493
1494 - def ungap(self, gap=None):
1495 """Return a copy of the sequence without the gap character(s). 1496 1497 The gap character can be specified in two ways - either as an explicit 1498 argument, or via the sequence's alphabet. For example: 1499 1500 >>> from Bio.Seq import UnknownSeq 1501 >>> from Bio.Alphabet import Gapped, generic_dna 1502 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1503 >>> my_dna 1504 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1505 >>> my_dna.ungap() 1506 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1507 >>> my_dna.ungap("-") 1508 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1509 1510 If the UnknownSeq is using the gap character, then an empty Seq is 1511 returned: 1512 1513 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1514 >>> my_gap 1515 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1516 >>> my_gap.ungap() 1517 Seq('', DNAAlphabet()) 1518 >>> my_gap.ungap("-") 1519 Seq('', DNAAlphabet()) 1520 1521 Notice that the returned sequence's alphabet is adjusted to remove any 1522 explicit gap character declaration. 1523 """ 1524 # Offload the alphabet stuff 1525 s = Seq(self._character, self.alphabet).ungap() 1526 if s: 1527 return UnknownSeq(self._length, s.alphabet, self._character) 1528 else: 1529 return Seq("", s.alphabet)
1530 1531
1532 -class MutableSeq(object):
1533 """An editable sequence object (with an alphabet). 1534 1535 Unlike normal python strings and our basic sequence object (the Seq class) 1536 which are immutable, the MutableSeq lets you edit the sequence in place. 1537 However, this means you cannot use a MutableSeq object as a dictionary key. 1538 1539 >>> from Bio.Seq import MutableSeq 1540 >>> from Bio.Alphabet import generic_dna 1541 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1542 >>> my_seq 1543 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1544 >>> my_seq[5] 1545 'T' 1546 >>> my_seq[5] = "A" 1547 >>> my_seq 1548 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1549 >>> my_seq[5] 1550 'A' 1551 >>> my_seq[5:8] = "NNN" 1552 >>> my_seq 1553 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1554 >>> len(my_seq) 1555 11 1556 1557 Note that the MutableSeq object does not support as many string-like 1558 or biological methods as the Seq object. 1559 """
1560 - def __init__(self, data, alphabet=Alphabet.generic_alphabet):
1561 if sys.version_info[0] == 3: 1562 self.array_indicator = "u" 1563 else: 1564 self.array_indicator = "c" 1565 if isinstance(data, str): # TODO - What about unicode? 1566 self.data = array.array(self.array_indicator, data) 1567 else: 1568 self.data = data # assumes the input is an array 1569 self.alphabet = alphabet
1570
1571 - def __repr__(self):
1572 """Returns a (truncated) representation of the sequence for debugging.""" 1573 if len(self) > 60: 1574 # Shows the last three letters as it is often useful to see if there 1575 # is a stop codon at the end of a sequence. 1576 # Note total length is 54+3+3=60 1577 return "{0}('{1}...{2}', {3!r})".format(self.__class__.__name__, 1578 str(self[:54]), 1579 str(self[-3:]), 1580 self.alphabet) 1581 else: 1582 return "{0}('{1}', {2!r})".format(self.__class__.__name__, 1583 str(self), 1584 self.alphabet)
1585
1586 - def __str__(self):
1587 """Returns the full sequence as a python string. 1588 1589 Note that Biopython 1.44 and earlier would give a truncated 1590 version of repr(my_seq) for str(my_seq). If you are writing code 1591 which needs to be backwards compatible with old Biopython, you 1592 should continue to use my_seq.tostring() rather than str(my_seq). 1593 """ 1594 # See test_GAQueens.py for an historic usage of a non-string alphabet! 1595 return "".join(self.data)
1596
1597 - def __eq__(self, other):
1598 """Compare the sequence to another sequence or a string (README). 1599 1600 Currently if compared to another sequence the alphabets must be 1601 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1602 an exception. Otherwise only the sequence itself is compared, not the 1603 precise alphabet. 1604 1605 A future release of Biopython will change this (and the Seq object etc) 1606 to use simple string comparison. The plan is that comparing sequences 1607 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1608 but not an exception. 1609 1610 During this transition period, please just do explicit comparisons: 1611 1612 >>> seq1 = MutableSeq("ACGT") 1613 >>> seq2 = MutableSeq("ACGT") 1614 >>> id(seq1) == id(seq2) 1615 False 1616 >>> str(seq1) == str(seq2) 1617 True 1618 1619 Biopython now does: 1620 1621 >>> seq1 == seq2 1622 True 1623 >>> seq1 == Seq("ACGT") 1624 True 1625 >>> seq1 == "ACGT" 1626 True 1627 1628 """ 1629 if hasattr(other, "alphabet"): 1630 if not Alphabet._check_type_compatible([self.alphabet, 1631 other.alphabet]): 1632 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1633 self.alphabet, other.alphabet), 1634 BiopythonWarning) 1635 if isinstance(other, MutableSeq): 1636 return self.data == other.data 1637 return str(self) == str(other)
1638
1639 - def __ne__(self, other):
1640 """Not equal, see __eq__ documentation.""" 1641 # Seem to require this method under Python 2 but not needed on Python 3? 1642 return not (self == other)
1643
1644 - def __lt__(self, other):
1645 """Less than, see __eq__ documentation.""" 1646 if hasattr(other, "alphabet"): 1647 if not Alphabet._check_type_compatible([self.alphabet, 1648 other.alphabet]): 1649 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1650 self.alphabet, other.alphabet), 1651 BiopythonWarning) 1652 if isinstance(other, MutableSeq): 1653 return self.data < other.data 1654 return str(self) < str(other)
1655
1656 - def __le__(self, other):
1657 """Less than or equal, see __eq__ documentation.""" 1658 if hasattr(other, "alphabet"): 1659 if not Alphabet._check_type_compatible([self.alphabet, 1660 other.alphabet]): 1661 warnings.warn("Incompatible alphabets {0!r} and {1!r}".format( 1662 self.alphabet, other.alphabet), 1663 BiopythonWarning) 1664 if isinstance(other, MutableSeq): 1665 return self.data <= other.data 1666 return str(self) <= str(other)
1667
1668 - def __len__(self):
1669 return len(self.data)
1670
1671 - def __getitem__(self, index):
1672 # Note since Python 2.0, __getslice__ is deprecated 1673 # and __getitem__ is used instead. 1674 # See http://docs.python.org/ref/sequence-methods.html 1675 if isinstance(index, int): 1676 # Return a single letter as a string 1677 return self.data[index] 1678 else: 1679 # Return the (sub)sequence as another Seq object 1680 return MutableSeq(self.data[index], self.alphabet)
1681
1682 - def __setitem__(self, index, value):
1683 # Note since Python 2.0, __setslice__ is deprecated 1684 # and __setitem__ is used instead. 1685 # See http://docs.python.org/ref/sequence-methods.html 1686 if isinstance(index, int): 1687 # Replacing a single letter with a new string 1688 self.data[index] = value 1689 else: 1690 # Replacing a sub-sequence 1691 if isinstance(value, MutableSeq): 1692 self.data[index] = value.data 1693 elif isinstance(value, type(self.data)): 1694 self.data[index] = value 1695 else: 1696 self.data[index] = array.array(self.array_indicator, 1697 str(value))
1698
1699 - def __delitem__(self, index):
1700 # Note since Python 2.0, __delslice__ is deprecated 1701 # and __delitem__ is used instead. 1702 # See http://docs.python.org/ref/sequence-methods.html 1703 1704 # Could be deleting a single letter, or a slice 1705 del self.data[index]
1706
1707 - def __add__(self, other):
1708 """Add another sequence or string to this sequence. 1709 1710 Returns a new MutableSeq object.""" 1711 if hasattr(other, "alphabet"): 1712 # other should be a Seq or a MutableSeq 1713 if not Alphabet._check_type_compatible([self.alphabet, 1714 other.alphabet]): 1715 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 1716 self.alphabet, other.alphabet)) 1717 # They should be the same sequence type (or one of them is generic) 1718 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1719 if isinstance(other, MutableSeq): 1720 # See test_GAQueens.py for an historic usage of a non-string 1721 # alphabet! Adding the arrays should support this. 1722 return self.__class__(self.data + other.data, a) 1723 else: 1724 return self.__class__(str(self) + str(other), a) 1725 elif isinstance(other, basestring): 1726 # other is a plain string - use the current alphabet 1727 return self.__class__(str(self) + str(other), self.alphabet) 1728 else: 1729 raise TypeError
1730
1731 - def __radd__(self, other):
1732 if hasattr(other, "alphabet"): 1733 # other should be a Seq or a MutableSeq 1734 if not Alphabet._check_type_compatible([self.alphabet, 1735 other.alphabet]): 1736 raise TypeError("Incompatible alphabets {0!r} and {1!r}".format( 1737 self.alphabet, other.alphabet)) 1738 # They should be the same sequence type (or one of them is generic) 1739 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1740 if isinstance(other, MutableSeq): 1741 # See test_GAQueens.py for an historic usage of a non-string 1742 # alphabet! Adding the arrays should support this. 1743 return self.__class__(other.data + self.data, a) 1744 else: 1745 return self.__class__(str(other) + str(self), a) 1746 elif isinstance(other, basestring): 1747 # other is a plain string - use the current alphabet 1748 return self.__class__(str(other) + str(self), self.alphabet) 1749 else: 1750 raise TypeError
1751
1752 - def append(self, c):
1753 self.data.append(c)
1754
1755 - def insert(self, i, c):
1756 self.data.insert(i, c)
1757
1758 - def pop(self, i=(-1)):
1759 c = self.data[i] 1760 del self.data[i] 1761 return c
1762
1763 - def remove(self, item):
1764 for i in range(len(self.data)): 1765 if self.data[i] == item: 1766 del self.data[i] 1767 return 1768 raise ValueError("MutableSeq.remove(x): x not in list")
1769
1770 - def count(self, sub, start=0, end=sys.maxsize):
1771 """Non-overlapping count method, like that of a python string. 1772 1773 This behaves like the python string method of the same name, 1774 which does a non-overlapping count! 1775 1776 Returns an integer, the number of occurrences of substring 1777 argument sub in the (sub)sequence given by [start:end]. 1778 Optional arguments start and end are interpreted as in slice 1779 notation. 1780 1781 Arguments: 1782 - sub - a string or another Seq object to look for 1783 - start - optional integer, slice start 1784 - end - optional integer, slice end 1785 1786 e.g. 1787 1788 >>> from Bio.Seq import MutableSeq 1789 >>> my_mseq = MutableSeq("AAAATGA") 1790 >>> print(my_mseq.count("A")) 1791 5 1792 >>> print(my_mseq.count("ATG")) 1793 1 1794 >>> print(my_mseq.count(Seq("AT"))) 1795 1 1796 >>> print(my_mseq.count("AT", 2, -1)) 1797 1 1798 1799 HOWEVER, please note because that python strings, Seq objects and 1800 MutableSeq objects do a non-overlapping search, this may not give 1801 the answer you expect: 1802 1803 >>> "AAAA".count("AA") 1804 2 1805 >>> print(MutableSeq("AAAA").count("AA")) 1806 2 1807 1808 An overlapping search would give the answer as three! 1809 """ 1810 try: 1811 # TODO - Should we check the alphabet? 1812 search = str(sub) 1813 except AttributeError: 1814 search = sub 1815 1816 if not isinstance(search, basestring): 1817 raise TypeError("expected a string, Seq or MutableSeq") 1818 1819 if len(search) == 1: 1820 # Try and be efficient and work directly from the array. 1821 count = 0 1822 for c in self.data[start:end]: 1823 if c == search: 1824 count += 1 1825 return count 1826 else: 1827 # TODO - Can we do this more efficiently? 1828 return str(self).count(search, start, end)
1829
1830 - def index(self, item):
1831 for i in range(len(self.data)): 1832 if self.data[i] == item: 1833 return i 1834 raise ValueError("MutableSeq.index(x): x not in list")
1835
1836 - def reverse(self):
1837 """Modify the mutable sequence to reverse itself. 1838 1839 No return value. 1840 """ 1841 self.data.reverse()
1842
1843 - def complement(self):
1844 """Modify the mutable sequence to take on its complement. 1845 1846 Trying to complement a protein sequence raises an exception. 1847 1848 No return value. 1849 """ 1850 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1851 Alphabet.ProteinAlphabet): 1852 raise ValueError("Proteins do not have complements!") 1853 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1854 d = ambiguous_dna_complement 1855 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1856 d = ambiguous_rna_complement 1857 elif 'U' in self.data and 'T' in self.data: 1858 # TODO - Handle this cleanly? 1859 raise ValueError("Mixed RNA/DNA found") 1860 elif 'U' in self.data: 1861 d = ambiguous_rna_complement 1862 else: 1863 d = ambiguous_dna_complement 1864 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1865 d.update(c) 1866 self.data = [d[c] for c in self.data] 1867 self.data = array.array(self.array_indicator, self.data)
1868
1869 - def reverse_complement(self):
1870 """Modify the mutable sequence to take on its reverse complement. 1871 1872 Trying to reverse complement a protein sequence raises an exception. 1873 1874 No return value. 1875 """ 1876 self.complement() 1877 self.data.reverse()
1878 1879 # Sorting a sequence makes no sense. 1880 # def sort(self, *args): self.data.sort(*args) 1881
1882 - def extend(self, other):
1883 if isinstance(other, MutableSeq): 1884 for c in other.data: 1885 self.data.append(c) 1886 else: 1887 for c in other: 1888 self.data.append(c)
1889
1890 - def tostring(self):
1891 """Returns the full sequence as a python string (DEPRECATED). 1892 1893 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 1894 as this method is officially deprecated. 1895 1896 Because str(my_seq) will give you the full sequence as a python string, 1897 there is often no need to make an explicit conversion. For example, 1898 1899 >>> my_seq = Seq("ATCGTG") 1900 >>> my_name = "seq_1" 1901 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1902 ID={seq_1}, sequence={ATCGTG} 1903 1904 On Biopython 1.44 or older you would have to have done this: 1905 1906 >>> print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1907 ID={seq_1}, sequence={ATCGTG} 1908 """ 1909 from Bio import BiopythonDeprecationWarning 1910 warnings.warn("This method is obsolete; please use str(my_seq) " 1911 "instead of my_seq.tostring().", 1912 BiopythonDeprecationWarning) 1913 return "".join(self.data)
1914
1915 - def toseq(self):
1916 """Returns the full sequence as a new immutable Seq object. 1917 1918 >>> from Bio.Seq import Seq 1919 >>> from Bio.Alphabet import IUPAC 1920 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1921 ... IUPAC.protein) 1922 >>> my_mseq 1923 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1924 >>> my_mseq.toseq() 1925 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1926 1927 Note that the alphabet is preserved. 1928 """ 1929 return Seq("".join(self.data), self.alphabet)
1930 1931 1932 # The transcribe, backward_transcribe, and translate functions are 1933 # user-friendly versions of the corresponding functions in Bio.Transcribe 1934 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1935
1936 -def transcribe(dna):
1937 """Transcribes a DNA sequence into RNA. 1938 1939 If given a string, returns a new string object. 1940 1941 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1942 1943 Trying to transcribe a protein or RNA sequence raises an exception. 1944 1945 e.g. 1946 1947 >>> transcribe("ACTGN") 1948 'ACUGN' 1949 """ 1950 if isinstance(dna, Seq): 1951 return dna.transcribe() 1952 elif isinstance(dna, MutableSeq): 1953 return dna.toseq().transcribe() 1954 else: 1955 return dna.replace('T', 'U').replace('t', 'u')
1956 1957
1958 -def back_transcribe(rna):
1959 """Back-transcribes an RNA sequence into DNA. 1960 1961 If given a string, returns a new string object. 1962 1963 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1964 1965 Trying to transcribe a protein or DNA sequence raises an exception. 1966 1967 e.g. 1968 1969 >>> back_transcribe("ACUGN") 1970 'ACTGN' 1971 """ 1972 if isinstance(rna, Seq): 1973 return rna.back_transcribe() 1974 elif isinstance(rna, MutableSeq): 1975 return rna.toseq().back_transcribe() 1976 else: 1977 return rna.replace('U', 'T').replace('u', 't')
1978 1979
1980 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1981 cds=False, pos_stop="X", gap=None):
1982 """Helper function to translate a nucleotide string (PRIVATE). 1983 1984 Arguments: 1985 - sequence - a string 1986 - table - a CodonTable object (NOT a table name or id number) 1987 - stop_symbol - a single character string, what to use for terminators. 1988 - to_stop - boolean, should translation terminate at the first 1989 in frame stop codon? If there is no in-frame stop codon 1990 then translation continues to the end. 1991 - pos_stop - a single character string for a possible stop codon 1992 (e.g. TAN or NNN) 1993 - cds - Boolean, indicates this is a complete CDS. If True, this 1994 checks the sequence starts with a valid alternative start 1995 codon (which will be translated as methionine, M), that the 1996 sequence length is a multiple of three, and that there is a 1997 single in frame stop codon at the end (this will be excluded 1998 from the protein sequence, regardless of the to_stop option). 1999 If these tests fail, an exception is raised. 2000 - gap - Single character string to denote symbol used for gaps. 2001 Defaults to None. 2002 2003 Returns a string. 2004 2005 e.g. 2006 2007 >>> from Bio.Data import CodonTable 2008 >>> table = CodonTable.ambiguous_dna_by_id[1] 2009 >>> _translate_str("AAA", table) 2010 'K' 2011 >>> _translate_str("TAR", table) 2012 '*' 2013 >>> _translate_str("TAN", table) 2014 'X' 2015 >>> _translate_str("TAN", table, pos_stop="@") 2016 '@' 2017 >>> _translate_str("TA?", table) 2018 Traceback (most recent call last): 2019 ... 2020 TranslationError: Codon 'TA?' is invalid 2021 2022 In a change to older versions of Biopython, partial codons are now 2023 always regarded as an error (previously only checked if cds=True) 2024 and will trigger a warning (likely to become an exception in a 2025 future release). 2026 2027 If **cds=True**, the start and stop codons are checked, and the start 2028 codon will be translated at methionine. The sequence must be an 2029 while number of codons. 2030 2031 >>> _translate_str("ATGCCCTAG", table, cds=True) 2032 'MP' 2033 >>> _translate_str("AAACCCTAG", table, cds=True) 2034 Traceback (most recent call last): 2035 ... 2036 TranslationError: First codon 'AAA' is not a start codon 2037 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 2038 Traceback (most recent call last): 2039 ... 2040 TranslationError: Extra in frame stop codon found. 2041 """ 2042 sequence = sequence.upper() 2043 amino_acids = [] 2044 forward_table = table.forward_table 2045 stop_codons = table.stop_codons 2046 if table.nucleotide_alphabet.letters is not None: 2047 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 2048 else: 2049 # Assume the worst case, ambiguous DNA or RNA: 2050 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 2051 IUPAC.ambiguous_rna.letters.upper()) 2052 n = len(sequence) 2053 if cds: 2054 if str(sequence[:3]).upper() not in table.start_codons: 2055 raise CodonTable.TranslationError( 2056 "First codon '{0}' is not a start codon".format(sequence[:3])) 2057 if n % 3 != 0: 2058 raise CodonTable.TranslationError( 2059 "Sequence length {0} is not a multiple of three".format(n)) 2060 if str(sequence[-3:]).upper() not in stop_codons: 2061 raise CodonTable.TranslationError( 2062 "Final codon '{0}' is not a stop codon".format(sequence[-3:])) 2063 # Don't translate the stop symbol, and manually translate the M 2064 sequence = sequence[3:-3] 2065 n -= 6 2066 amino_acids = ["M"] 2067 elif n % 3 != 0: 2068 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 2069 "Explicitly trim the sequence or add trailing N before " 2070 "translation. This may become an error in future.", 2071 BiopythonWarning) 2072 if gap is not None: 2073 if not isinstance(gap, basestring): 2074 raise TypeError("Gap character should be a single character string.") 2075 elif len(gap) > 1: 2076 raise ValueError("Gap character should be a single character string.") 2077 2078 for i in range(0, n - n % 3, 3): 2079 codon = sequence[i:i + 3] 2080 try: 2081 amino_acids.append(forward_table[codon]) 2082 except (KeyError, CodonTable.TranslationError): 2083 if codon in table.stop_codons: 2084 if cds: 2085 raise CodonTable.TranslationError( 2086 "Extra in frame stop codon found.") 2087 if to_stop: 2088 break 2089 amino_acids.append(stop_symbol) 2090 elif valid_letters.issuperset(set(codon)): 2091 # Possible stop codon (e.g. NNN or TAN) 2092 amino_acids.append(pos_stop) 2093 elif gap is not None and codon == gap * 3: 2094 # Gapped translation 2095 amino_acids.append(gap) 2096 else: 2097 raise CodonTable.TranslationError( 2098 "Codon '{0}' is invalid".format(codon)) 2099 return "".join(amino_acids)
2100 2101
2102 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2103 cds=False, gap=None):
2104 """Translate a nucleotide sequence into amino acids. 2105 2106 If given a string, returns a new string object. Given a Seq or 2107 MutableSeq, returns a Seq object with a protein alphabet. 2108 2109 Arguments: 2110 - table - Which codon table to use? This can be either a name (string), 2111 an NCBI identifier (integer), or a CodonTable object (useful 2112 for non-standard genetic codes). Defaults to the "Standard" 2113 table. 2114 - stop_symbol - Single character string, what to use for any 2115 terminators, defaults to the asterisk, "*". 2116 - to_stop - Boolean, defaults to False meaning do a full 2117 translation continuing on past any stop codons 2118 (translated as the specified stop_symbol). If 2119 True, translation is terminated at the first in 2120 frame stop codon (and the stop_symbol is not 2121 appended to the returned protein sequence). 2122 - cds - Boolean, indicates this is a complete CDS. If True, this 2123 checks the sequence starts with a valid alternative start 2124 codon (which will be translated as methionine, M), that the 2125 sequence length is a multiple of three, and that there is a 2126 single in frame stop codon at the end (this will be excluded 2127 from the protein sequence, regardless of the to_stop option). 2128 If these tests fail, an exception is raised. 2129 - gap - Single character string to denote symbol used for gaps. 2130 Defaults to None. 2131 2132 A simple string example using the default (standard) genetic code: 2133 2134 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2135 >>> translate(coding_dna) 2136 'VAIVMGR*KGAR*' 2137 >>> translate(coding_dna, stop_symbol="@") 2138 'VAIVMGR@KGAR@' 2139 >>> translate(coding_dna, to_stop=True) 2140 'VAIVMGR' 2141 2142 Now using NCBI table 2, where TGA is not a stop codon: 2143 2144 >>> translate(coding_dna, table=2) 2145 'VAIVMGRWKGAR*' 2146 >>> translate(coding_dna, table=2, to_stop=True) 2147 'VAIVMGRWKGAR' 2148 2149 In fact this example uses an alternative start codon valid under NCBI table 2, 2150 GTG, which means this example is a complete valid CDS which when translated 2151 should really start with methionine (not valine): 2152 2153 >>> translate(coding_dna, table=2, cds=True) 2154 'MAIVMGRWKGAR' 2155 2156 Note that if the sequence has no in-frame stop codon, then the to_stop 2157 argument has no effect: 2158 2159 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2160 >>> translate(coding_dna2) 2161 'VAIVMGR' 2162 >>> translate(coding_dna2, to_stop=True) 2163 'VAIVMGR' 2164 2165 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2166 or a stop codon. These are translated as "X". Any invalid codon 2167 (e.g. "TA?" or "T-A") will throw a TranslationError. 2168 2169 It will however translate either DNA or RNA. 2170 """ 2171 if isinstance(sequence, Seq): 2172 return sequence.translate(table, stop_symbol, to_stop, cds) 2173 elif isinstance(sequence, MutableSeq): 2174 # Return a Seq object 2175 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2176 else: 2177 # Assume its a string, return a string 2178 try: 2179 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2180 except ValueError: 2181 codon_table = CodonTable.ambiguous_generic_by_name[table] 2182 except (AttributeError, TypeError): 2183 if isinstance(table, CodonTable.CodonTable): 2184 codon_table = table 2185 else: 2186 raise ValueError('Bad table argument') 2187 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds, 2188 gap=gap)
2189 2190
2191 -def reverse_complement(sequence):
2192 """Returns the reverse complement sequence of a nucleotide string. 2193 2194 If given a string, returns a new string object. 2195 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2196 2197 Supports unambiguous and ambiguous nucleotide sequences. 2198 2199 e.g. 2200 2201 >>> reverse_complement("ACTG-NH") 2202 'DN-CAGT' 2203 """ 2204 if isinstance(sequence, Seq): 2205 # Return a Seq 2206 return sequence.reverse_complement() 2207 elif isinstance(sequence, MutableSeq): 2208 # Return a Seq 2209 # Don't use the MutableSeq reverse_complement method as it is 'in place'. 2210 return sequence.toseq().reverse_complement() 2211 2212 # Assume its a string. 2213 # In order to avoid some code duplication, the old code would turn the string 2214 # into a Seq, use the reverse_complement method, and convert back to a string. 2215 # This worked, but is over five times slower on short sequences! 2216 if ('U' in sequence or 'u' in sequence) \ 2217 and ('T' in sequence or 't' in sequence): 2218 raise ValueError("Mixed RNA/DNA found") 2219 elif 'U' in sequence or 'u' in sequence: 2220 ttable = _rna_complement_table 2221 else: 2222 ttable = _dna_complement_table 2223 return sequence.translate(ttable)[::-1]
2224 2225
2226 -def _test():
2227 """Run the Bio.Seq module's doctests (PRIVATE).""" 2228 if sys.version_info[0:2] == (3, 1): 2229 print("Not running Bio.Seq doctest on Python 3.1") 2230 print("See http://bugs.python.org/issue7490") 2231 else: 2232 print("Running doctests...") 2233 import doctest 2234 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2235 print("Done")
2236 2237 if __name__ == "__main__": 2238 _test() 2239