Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  from __future__ import print_function 
  15   
  16  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
  17   
  18  import string  # for maketrans only 
  19  import array 
  20  import sys 
  21  import warnings 
  22   
  23  from Bio._py3k import range 
  24  from Bio._py3k import basestring 
  25   
  26  from Bio import Alphabet 
  27  from Bio.Alphabet import IUPAC 
  28  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  29  from Bio.Data import CodonTable 
  30   
  31   
32 -def _maketrans(complement_mapping):
33 """Makes a python string translation table (PRIVATE). 34 35 Arguments: 36 - complement_mapping - a dictionary such as ambiguous_dna_complement 37 and ambiguous_rna_complement from Data.IUPACData. 38 39 Returns a translation table (a string of length 256) for use with the 40 python string's translate method to use in a (reverse) complement. 41 42 Compatible with lower case and upper case sequences. 43 44 For internal use only. 45 """ 46 before = ''.join(complement_mapping.keys()) 47 after = ''.join(complement_mapping.values()) 48 before = before + before.lower() 49 after = after + after.lower() 50 if sys.version_info[0] == 3: 51 return str.maketrans(before, after) 52 else: 53 return string.maketrans(before, after)
54 55 _dna_complement_table = _maketrans(ambiguous_dna_complement) 56 _rna_complement_table = _maketrans(ambiguous_rna_complement) 57 58
59 -class Seq(object):
60 """A read-only sequence object (essentially a string with an alphabet). 61 62 Like normal python strings, our basic sequence object is immutable. 63 This prevents you from doing my_seq[5] = "A" for example, but does allow 64 Seq objects to be used as dictionary keys. 65 66 The Seq object provides a number of string like methods (such as count, 67 find, split and strip), which are alphabet aware where appropriate. 68 69 In addition to the string like sequence, the Seq object has an alphabet 70 property. This is an instance of an Alphabet class from Bio.Alphabet, 71 for example generic DNA, or IUPAC DNA. This describes the type of molecule 72 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 73 (letters). 74 75 The Seq object also provides some biological methods, such as complement, 76 reverse_complement, transcribe, back_transcribe and translate (which are 77 not applicable to sequences with a protein alphabet). 78 """
79 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
80 """Create a Seq object. 81 82 Arguments: 83 - seq - Sequence, required (string) 84 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 85 86 You will typically use Bio.SeqIO to read in sequences from files as 87 SeqRecord objects, whose sequence will be exposed as a Seq object via 88 the seq property. 89 90 However, will often want to create your own Seq objects directly: 91 92 >>> from Bio.Seq import Seq 93 >>> from Bio.Alphabet import IUPAC 94 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 95 ... IUPAC.protein) 96 >>> my_seq 97 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 98 >>> print(my_seq) 99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 100 >>> my_seq.alphabet 101 IUPACProtein() 102 103 """ 104 # Enforce string storage 105 if not isinstance(data, basestring): 106 raise TypeError("The sequence data given to a Seq object should " 107 "be a string (not another Seq object etc)") 108 self._data = data 109 self.alphabet = alphabet # Seq API requirement
110
111 - def __repr__(self):
112 """Returns a (truncated) representation of the sequence for debugging.""" 113 if len(self) > 60: 114 #Shows the last three letters as it is often useful to see if there 115 #is a stop codon at the end of a sequence. 116 #Note total length is 54+3+3=60 117 return "%s('%s...%s', %s)" % (self.__class__.__name__, 118 str(self)[:54], str(self)[-3:], 119 repr(self.alphabet)) 120 else: 121 return "%s(%s, %s)" % (self.__class__.__name__, 122 repr(self._data), 123 repr(self.alphabet))
124
125 - def __str__(self):
126 """Returns the full sequence as a python string, use str(my_seq). 127 128 Note that Biopython 1.44 and earlier would give a truncated 129 version of repr(my_seq) for str(my_seq). If you are writing code 130 which need to be backwards compatible with old Biopython, you 131 should continue to use my_seq.tostring() rather than str(my_seq). 132 """ 133 return self._data
134
135 - def __hash__(self):
136 """Hash for comparison. 137 138 See the __cmp__ documentation - we plan to change this! 139 """ 140 warnings.warn("In future comparing Seq objects will use string " 141 "comparison (not object comparison). Please use " 142 "hash(id(my_seq)) or my_dict[id(my_seq)] if you " 143 "want the current behaviour, or for string hashing " 144 "use hash(str(my_seq)) or my_dict[str(my_seq)] to " 145 "to make your code explicit and to avoid this " 146 "warning.", FutureWarning) 147 return id(self) # Currently use object identity for equality testing
148
149 - def __cmp__(self, other):
150 """Compare the sequence to another sequence or a string (README). 151 152 Historically comparing Seq objects has done Python object comparison. 153 After considerable discussion (keeping in mind constraints of the 154 Python language, hashes and dictionary support) a future release of 155 Biopython will change this to use simple string comparison. The plan is 156 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a 157 warning. 158 159 This version of Biopython still does Python object comparison, but with 160 a warning about this future change. During this transition period, 161 please just do explicit comparisons: 162 163 >>> seq1 = Seq("ACGT") 164 >>> seq2 = Seq("ACGT") 165 >>> id(seq1) == id(seq2) 166 False 167 >>> str(seq1) == str(seq2) 168 True 169 170 Note - This method indirectly supports ==, < , etc. 171 """ 172 if hasattr(other, "alphabet"): 173 #other should be a Seq or a MutableSeq 174 warnings.warn("In future comparing Seq objects will use string " 175 "comparison (not object comparison). Incompatible " 176 "alphabets will trigger a warning (not an exception). " 177 "In the interim please use id(seq1)==id(seq2) or " 178 "str(seq1)==str(seq2) to make your code explicit " 179 "and to avoid this warning.", FutureWarning) 180 return cmp(id(self), id(other))
181
182 - def __len__(self):
183 """Returns the length of the sequence, use len(my_seq).""" 184 return len(self._data) # Seq API requirement
185
186 - def __getitem__(self, index): # Seq API requirement
187 """Returns a subsequence of single letter, use my_seq[index].""" 188 #Note since Python 2.0, __getslice__ is deprecated 189 #and __getitem__ is used instead. 190 #See http://docs.python.org/ref/sequence-methods.html 191 if isinstance(index, int): 192 #Return a single letter as a string 193 return self._data[index] 194 else: 195 #Return the (sub)sequence as another Seq object 196 return Seq(self._data[index], self.alphabet)
197
198 - def __add__(self, other):
199 """Add another sequence or string to this sequence. 200 201 If adding a string to a Seq, the alphabet is preserved: 202 203 >>> from Bio.Seq import Seq 204 >>> from Bio.Alphabet import generic_protein 205 >>> Seq("MELKI", generic_protein) + "LV" 206 Seq('MELKILV', ProteinAlphabet()) 207 208 When adding two Seq (like) objects, the alphabets are important. 209 Consider this example: 210 211 >>> from Bio.Seq import Seq 212 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 213 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 214 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 215 >>> unamb_dna_seq 216 Seq('ACGT', IUPACUnambiguousDNA()) 217 >>> ambig_dna_seq 218 Seq('ACRGT', IUPACAmbiguousDNA()) 219 220 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 221 the more general ambiguous IUPAC DNA alphabet: 222 223 >>> unamb_dna_seq + ambig_dna_seq 224 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 225 226 However, if the default generic alphabet is included, the result is 227 a generic alphabet: 228 229 >>> Seq("") + ambig_dna_seq 230 Seq('ACRGT', Alphabet()) 231 232 You can't add RNA and DNA sequences: 233 234 >>> from Bio.Alphabet import generic_dna, generic_rna 235 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 236 Traceback (most recent call last): 237 ... 238 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 239 240 You can't add nucleotide and protein sequences: 241 242 >>> from Bio.Alphabet import generic_dna, generic_protein 243 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 244 Traceback (most recent call last): 245 ... 246 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 247 """ 248 if hasattr(other, "alphabet"): 249 #other should be a Seq or a MutableSeq 250 if not Alphabet._check_type_compatible([self.alphabet, 251 other.alphabet]): 252 raise TypeError("Incompatible alphabets %s and %s" 253 % (repr(self.alphabet), repr(other.alphabet))) 254 #They should be the same sequence type (or one of them is generic) 255 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 256 return self.__class__(str(self) + str(other), a) 257 elif isinstance(other, basestring): 258 #other is a plain string - use the current alphabet 259 return self.__class__(str(self) + other, self.alphabet) 260 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 261 if isinstance(other, SeqRecord): 262 #Get the SeqRecord's __radd__ to handle this 263 return NotImplemented 264 else: 265 raise TypeError
266
267 - def __radd__(self, other):
268 """Adding a sequence on the left. 269 270 If adding a string to a Seq, the alphabet is preserved: 271 272 >>> from Bio.Seq import Seq 273 >>> from Bio.Alphabet import generic_protein 274 >>> "LV" + Seq("MELKI", generic_protein) 275 Seq('LVMELKI', ProteinAlphabet()) 276 277 Adding two Seq (like) objects is handled via the __add__ method. 278 """ 279 if hasattr(other, "alphabet"): 280 #other should be a Seq or a MutableSeq 281 if not Alphabet._check_type_compatible([self.alphabet, 282 other.alphabet]): 283 raise TypeError("Incompatable alphabets %s and %s" 284 % (repr(self.alphabet), repr(other.alphabet))) 285 #They should be the same sequence type (or one of them is generic) 286 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 287 return self.__class__(str(other) + str(self), a) 288 elif isinstance(other, basestring): 289 #other is a plain string - use the current alphabet 290 return self.__class__(other + str(self), self.alphabet) 291 else: 292 raise TypeError
293
294 - def tostring(self): # Seq API requirement
295 """Returns the full sequence as a python string (DEPRECATED). 296 297 You are now encouraged to use str(my_seq) instead of 298 my_seq.tostring().""" 299 from Bio import BiopythonDeprecationWarning 300 warnings.warn("This method is obsolete; please use str(my_seq) " 301 "instead of my_seq.tostring().", 302 BiopythonDeprecationWarning) 303 return str(self) 304
305 - def tomutable(self): # Needed? Or use a function?
306 """Returns the full sequence as a MutableSeq object. 307 308 >>> from Bio.Seq import Seq 309 >>> from Bio.Alphabet import IUPAC 310 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 311 ... IUPAC.protein) 312 >>> my_seq 313 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 314 >>> my_seq.tomutable() 315 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 316 317 Note that the alphabet is preserved. 318 """ 319 return MutableSeq(str(self), self.alphabet) 320
321 - def _get_seq_str_and_check_alphabet(self, other_sequence):
322 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 323 324 For a string argument, returns the string. 325 326 For a Seq or MutableSeq, it checks the alphabet is compatible 327 (raising an exception if it isn't), and then returns a string. 328 """ 329 try: 330 other_alpha = other_sequence.alphabet 331 except AttributeError: 332 #Assume other_sequence is a string 333 return other_sequence 334 335 #Other should be a Seq or a MutableSeq 336 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 337 raise TypeError("Incompatable alphabets %s and %s" 338 % (repr(self.alphabet), repr(other_alpha))) 339 #Return as a string 340 return str(other_sequence)
341
342 - def count(self, sub, start=0, end=sys.maxsize):
343 """Non-overlapping count method, like that of a python string. 344 345 This behaves like the python string method of the same name, 346 which does a non-overlapping count! 347 348 Returns an integer, the number of occurrences of substring 349 argument sub in the (sub)sequence given by [start:end]. 350 Optional arguments start and end are interpreted as in slice 351 notation. 352 353 Arguments: 354 - sub - a string or another Seq object to look for 355 - start - optional integer, slice start 356 - end - optional integer, slice end 357 358 e.g. 359 360 >>> from Bio.Seq import Seq 361 >>> my_seq = Seq("AAAATGA") 362 >>> print(my_seq.count("A")) 363 5 364 >>> print(my_seq.count("ATG")) 365 1 366 >>> print(my_seq.count(Seq("AT"))) 367 1 368 >>> print(my_seq.count("AT", 2, -1)) 369 1 370 371 HOWEVER, please note because python strings and Seq objects (and 372 MutableSeq objects) do a non-overlapping search, this may not give 373 the answer you expect: 374 375 >>> "AAAA".count("AA") 376 2 377 >>> print(Seq("AAAA").count("AA")) 378 2 379 380 An overlapping search would give the answer as three! 381 """ 382 #If it has one, check the alphabet: 383 sub_str = self._get_seq_str_and_check_alphabet(sub) 384 return str(self).count(sub_str, start, end)
385
386 - def __contains__(self, char):
387 """Implements the 'in' keyword, like a python string. 388 389 e.g. 390 391 >>> from Bio.Seq import Seq 392 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 393 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 394 >>> "AAA" in my_dna 395 True 396 >>> Seq("AAA") in my_dna 397 True 398 >>> Seq("AAA", generic_dna) in my_dna 399 True 400 401 Like other Seq methods, this will raise a type error if another Seq 402 (or Seq like) object with an incompatible alphabet is used: 403 404 >>> Seq("AAA", generic_rna) in my_dna 405 Traceback (most recent call last): 406 ... 407 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 408 >>> Seq("AAA", generic_protein) in my_dna 409 Traceback (most recent call last): 410 ... 411 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 412 """ 413 #If it has one, check the alphabet: 414 sub_str = self._get_seq_str_and_check_alphabet(char) 415 return sub_str in str(self)
416
417 - def find(self, sub, start=0, end=sys.maxsize):
418 """Find method, like that of a python string. 419 420 This behaves like the python string method of the same name. 421 422 Returns an integer, the index of the first occurrence of substring 423 argument sub in the (sub)sequence given by [start:end]. 424 425 Arguments: 426 - sub - a string or another Seq object to look for 427 - start - optional integer, slice start 428 - end - optional integer, slice end 429 430 Returns -1 if the subsequence is NOT found. 431 432 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 433 434 >>> from Bio.Seq import Seq 435 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 436 >>> my_rna.find("AUG") 437 3 438 """ 439 #If it has one, check the alphabet: 440 sub_str = self._get_seq_str_and_check_alphabet(sub) 441 return str(self).find(sub_str, start, end)
442
443 - def rfind(self, sub, start=0, end=sys.maxsize):
444 """Find from right method, like that of a python string. 445 446 This behaves like the python string method of the same name. 447 448 Returns an integer, the index of the last (right most) occurrence of 449 substring argument sub in the (sub)sequence given by [start:end]. 450 451 Arguments: 452 - sub - a string or another Seq object to look for 453 - start - optional integer, slice start 454 - end - optional integer, slice end 455 456 Returns -1 if the subsequence is NOT found. 457 458 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 459 460 >>> from Bio.Seq import Seq 461 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 462 >>> my_rna.rfind("AUG") 463 15 464 """ 465 #If it has one, check the alphabet: 466 sub_str = self._get_seq_str_and_check_alphabet(sub) 467 return str(self).rfind(sub_str, start, end)
468
469 - def startswith(self, prefix, start=0, end=sys.maxsize):
470 """Does the Seq start with the given prefix? Returns True/False. 471 472 This behaves like the python string method of the same name. 473 474 Return True if the sequence starts with the specified prefix 475 (a string or another Seq object), False otherwise. 476 With optional start, test sequence beginning at that position. 477 With optional end, stop comparing sequence at that position. 478 prefix can also be a tuple of strings to try. e.g. 479 480 >>> from Bio.Seq import Seq 481 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 482 >>> my_rna.startswith("GUC") 483 True 484 >>> my_rna.startswith("AUG") 485 False 486 >>> my_rna.startswith("AUG", 3) 487 True 488 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 489 True 490 """ 491 #If it has one, check the alphabet: 492 if isinstance(prefix, tuple): 493 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 494 for p in prefix) 495 return str(self).startswith(prefix_strs, start, end) 496 else: 497 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 498 return str(self).startswith(prefix_str, start, end)
499
500 - def endswith(self, suffix, start=0, end=sys.maxsize):
501 """Does the Seq end with the given suffix? Returns True/False. 502 503 This behaves like the python string method of the same name. 504 505 Return True if the sequence ends with the specified suffix 506 (a string or another Seq object), False otherwise. 507 With optional start, test sequence beginning at that position. 508 With optional end, stop comparing sequence at that position. 509 suffix can also be a tuple of strings to try. e.g. 510 511 >>> from Bio.Seq import Seq 512 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 513 >>> my_rna.endswith("UUG") 514 True 515 >>> my_rna.endswith("AUG") 516 False 517 >>> my_rna.endswith("AUG", 0, 18) 518 True 519 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 520 True 521 """ 522 #If it has one, check the alphabet: 523 if isinstance(suffix, tuple): 524 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 525 for p in suffix) 526 return str(self).endswith(suffix_strs, start, end) 527 else: 528 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 529 return str(self).endswith(suffix_str, start, end)
530
531 - def split(self, sep=None, maxsplit=-1):
532 """Split method, like that of a python string. 533 534 This behaves like the python string method of the same name. 535 536 Return a list of the 'words' in the string (as Seq objects), 537 using sep as the delimiter string. If maxsplit is given, at 538 most maxsplit splits are done. If maxsplit is omitted, all 539 splits are made. 540 541 Following the python string method, sep will by default be any 542 white space (tabs, spaces, newlines) but this is unlikely to 543 apply to biological sequences. 544 545 e.g. 546 547 >>> from Bio.Seq import Seq 548 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 549 >>> my_aa = my_rna.translate() 550 >>> my_aa 551 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 552 >>> my_aa.split("*") 553 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 554 >>> my_aa.split("*", 1) 555 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 556 557 See also the rsplit method: 558 559 >>> my_aa.rsplit("*", 1) 560 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 561 """ 562 #If it has one, check the alphabet: 563 sep_str = self._get_seq_str_and_check_alphabet(sep) 564 #TODO - If the sep is the defined stop symbol, or gap char, 565 #should we adjust the alphabet? 566 return [Seq(part, self.alphabet) 567 for part in str(self).split(sep_str, maxsplit)]
568
569 - def rsplit(self, sep=None, maxsplit=-1):
570 """Right split method, like that of a python string. 571 572 This behaves like the python string method of the same name. 573 574 Return a list of the 'words' in the string (as Seq objects), 575 using sep as the delimiter string. If maxsplit is given, at 576 most maxsplit splits are done COUNTING FROM THE RIGHT. 577 If maxsplit is omitted, all splits are made. 578 579 Following the python string method, sep will by default be any 580 white space (tabs, spaces, newlines) but this is unlikely to 581 apply to biological sequences. 582 583 e.g. print(my_seq.rsplit("*",1)) 584 585 See also the split method. 586 """ 587 #If it has one, check the alphabet: 588 sep_str = self._get_seq_str_and_check_alphabet(sep) 589 return [Seq(part, self.alphabet) 590 for part in str(self).rsplit(sep_str, maxsplit)]
591
592 - def strip(self, chars=None):
593 """Returns a new Seq object with leading and trailing ends stripped. 594 595 This behaves like the python string method of the same name. 596 597 Optional argument chars defines which characters to remove. If 598 omitted or None (default) then as for the python string method, 599 this defaults to removing any white space. 600 601 e.g. print(my_seq.strip("-")) 602 603 See also the lstrip and rstrip methods. 604 """ 605 #If it has one, check the alphabet: 606 strip_str = self._get_seq_str_and_check_alphabet(chars) 607 return Seq(str(self).strip(strip_str), self.alphabet)
608
609 - def lstrip(self, chars=None):
610 """Returns a new Seq object with leading (left) end stripped. 611 612 This behaves like the python string method of the same name. 613 614 Optional argument chars defines which characters to remove. If 615 omitted or None (default) then as for the python string method, 616 this defaults to removing any white space. 617 618 e.g. print(my_seq.lstrip("-")) 619 620 See also the strip and rstrip methods. 621 """ 622 #If it has one, check the alphabet: 623 strip_str = self._get_seq_str_and_check_alphabet(chars) 624 return Seq(str(self).lstrip(strip_str), self.alphabet)
625
626 - def rstrip(self, chars=None):
627 """Returns a new Seq object with trailing (right) end stripped. 628 629 This behaves like the python string method of the same name. 630 631 Optional argument chars defines which characters to remove. If 632 omitted or None (default) then as for the python string method, 633 this defaults to removing any white space. 634 635 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 636 637 >>> from Bio.Alphabet import IUPAC 638 >>> from Bio.Seq import Seq 639 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 640 >>> my_seq 641 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 642 >>> my_seq.rstrip("A") 643 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 644 645 See also the strip and lstrip methods. 646 """ 647 #If it has one, check the alphabet: 648 strip_str = self._get_seq_str_and_check_alphabet(chars) 649 return Seq(str(self).rstrip(strip_str), self.alphabet)
650
651 - def upper(self):
652 """Returns an upper case copy of the sequence. 653 654 >>> from Bio.Alphabet import HasStopCodon, generic_protein 655 >>> from Bio.Seq import Seq 656 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 657 >>> my_seq 658 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 659 >>> my_seq.lower() 660 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 661 >>> my_seq.upper() 662 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 663 664 This will adjust the alphabet if required. See also the lower method. 665 """ 666 return Seq(str(self).upper(), self.alphabet._upper())
667
668 - def lower(self):
669 """Returns a lower case copy of the sequence. 670 671 This will adjust the alphabet if required. Note that the IUPAC alphabets 672 are upper case only, and thus a generic alphabet must be substituted. 673 674 >>> from Bio.Alphabet import Gapped, generic_dna 675 >>> from Bio.Alphabet import IUPAC 676 >>> from Bio.Seq import Seq 677 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 678 >>> my_seq 679 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 680 >>> my_seq.lower() 681 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 682 683 See also the upper method. 684 """ 685 return Seq(str(self).lower(), self.alphabet._lower())
686
687 - def complement(self):
688 """Returns the complement sequence. New Seq object. 689 690 >>> from Bio.Seq import Seq 691 >>> from Bio.Alphabet import IUPAC 692 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 693 >>> my_dna 694 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 695 >>> my_dna.complement() 696 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 697 698 You can of course used mixed case sequences, 699 700 >>> from Bio.Seq import Seq 701 >>> from Bio.Alphabet import generic_dna 702 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 703 >>> my_dna 704 Seq('CCCCCgatA-GD', DNAAlphabet()) 705 >>> my_dna.complement() 706 Seq('GGGGGctaT-CH', DNAAlphabet()) 707 708 Note in the above example, ambiguous character D denotes 709 G, A or T so its complement is H (for C, T or A). 710 711 Trying to complement a protein sequence raises an exception. 712 713 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 714 >>> my_protein.complement() 715 Traceback (most recent call last): 716 ... 717 ValueError: Proteins do not have complements! 718 """ 719 base = Alphabet._get_base_alphabet(self.alphabet) 720 if isinstance(base, Alphabet.ProteinAlphabet): 721 raise ValueError("Proteins do not have complements!") 722 if isinstance(base, Alphabet.DNAAlphabet): 723 ttable = _dna_complement_table 724 elif isinstance(base, Alphabet.RNAAlphabet): 725 ttable = _rna_complement_table 726 elif ('U' in self._data or 'u' in self._data) \ 727 and ('T' in self._data or 't' in self._data): 728 #TODO - Handle this cleanly? 729 raise ValueError("Mixed RNA/DNA found") 730 elif 'U' in self._data or 'u' in self._data: 731 ttable = _rna_complement_table 732 else: 733 ttable = _dna_complement_table 734 #Much faster on really long sequences than the previous loop based one. 735 #thx to Michael Palmer, University of Waterloo 736 return Seq(str(self).translate(ttable), self.alphabet)
737
738 - def reverse_complement(self):
739 """Returns the reverse complement sequence. New Seq object. 740 741 >>> from Bio.Seq import Seq 742 >>> from Bio.Alphabet import IUPAC 743 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 744 >>> my_dna 745 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 746 >>> my_dna.reverse_complement() 747 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 748 749 Note in the above example, since R = G or A, its complement 750 is Y (which denotes C or T). 751 752 You can of course used mixed case sequences, 753 754 >>> from Bio.Seq import Seq 755 >>> from Bio.Alphabet import generic_dna 756 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 757 >>> my_dna 758 Seq('CCCCCgatA-G', DNAAlphabet()) 759 >>> my_dna.reverse_complement() 760 Seq('C-TatcGGGGG', DNAAlphabet()) 761 762 Trying to complement a protein sequence raises an exception: 763 764 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 765 >>> my_protein.reverse_complement() 766 Traceback (most recent call last): 767 ... 768 ValueError: Proteins do not have complements! 769 """ 770 #Use -1 stride/step to reverse the complement 771 return self.complement()[::-1]
772
773 - def transcribe(self):
774 """Returns the RNA sequence from a DNA sequence. New Seq object. 775 776 >>> from Bio.Seq import Seq 777 >>> from Bio.Alphabet import IUPAC 778 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 779 ... IUPAC.unambiguous_dna) 780 >>> coding_dna 781 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 782 >>> coding_dna.transcribe() 783 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 784 785 Trying to transcribe a protein or RNA sequence raises an exception: 786 787 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 788 >>> my_protein.transcribe() 789 Traceback (most recent call last): 790 ... 791 ValueError: Proteins cannot be transcribed! 792 """ 793 base = Alphabet._get_base_alphabet(self.alphabet) 794 if isinstance(base, Alphabet.ProteinAlphabet): 795 raise ValueError("Proteins cannot be transcribed!") 796 if isinstance(base, Alphabet.RNAAlphabet): 797 raise ValueError("RNA cannot be transcribed!") 798 799 if self.alphabet==IUPAC.unambiguous_dna: 800 alphabet = IUPAC.unambiguous_rna 801 elif self.alphabet==IUPAC.ambiguous_dna: 802 alphabet = IUPAC.ambiguous_rna 803 else: 804 alphabet = Alphabet.generic_rna 805 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
806
807 - def back_transcribe(self):
808 """Returns the DNA sequence from an RNA sequence. New Seq object. 809 810 >>> from Bio.Seq import Seq 811 >>> from Bio.Alphabet import IUPAC 812 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 813 ... IUPAC.unambiguous_rna) 814 >>> messenger_rna 815 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 816 >>> messenger_rna.back_transcribe() 817 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 818 819 Trying to back-transcribe a protein or DNA sequence raises an 820 exception: 821 822 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 823 >>> my_protein.back_transcribe() 824 Traceback (most recent call last): 825 ... 826 ValueError: Proteins cannot be back transcribed! 827 """ 828 base = Alphabet._get_base_alphabet(self.alphabet) 829 if isinstance(base, Alphabet.ProteinAlphabet): 830 raise ValueError("Proteins cannot be back transcribed!") 831 if isinstance(base, Alphabet.DNAAlphabet): 832 raise ValueError("DNA cannot be back transcribed!") 833 834 if self.alphabet==IUPAC.unambiguous_rna: 835 alphabet = IUPAC.unambiguous_dna 836 elif self.alphabet==IUPAC.ambiguous_rna: 837 alphabet = IUPAC.ambiguous_dna 838 else: 839 alphabet = Alphabet.generic_dna 840 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
841
842 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 843 cds=False):
844 """Turns a nucleotide sequence into a protein sequence. New Seq object. 845 846 This method will translate DNA or RNA sequences, and those with a 847 nucleotide or generic alphabet. Trying to translate a protein 848 sequence raises an exception. 849 850 Arguments: 851 - table - Which codon table to use? This can be either a name 852 (string), an NCBI identifier (integer), or a CodonTable 853 object (useful for non-standard genetic codes). This 854 defaults to the "Standard" table. 855 - stop_symbol - Single character string, what to use for terminators. 856 This defaults to the asterisk, "*". 857 - to_stop - Boolean, defaults to False meaning do a full translation 858 continuing on past any stop codons (translated as the 859 specified stop_symbol). If True, translation is 860 terminated at the first in frame stop codon (and the 861 stop_symbol is not appended to the returned protein 862 sequence). 863 - cds - Boolean, indicates this is a complete CDS. If True, 864 this checks the sequence starts with a valid alternative start 865 codon (which will be translated as methionine, M), that the 866 sequence length is a multiple of three, and that there is a 867 single in frame stop codon at the end (this will be excluded 868 from the protein sequence, regardless of the to_stop option). 869 If these tests fail, an exception is raised. 870 871 e.g. Using the standard table: 872 873 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 874 >>> coding_dna.translate() 875 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 876 >>> coding_dna.translate(stop_symbol="@") 877 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 878 >>> coding_dna.translate(to_stop=True) 879 Seq('VAIVMGR', ExtendedIUPACProtein()) 880 881 Now using NCBI table 2, where TGA is not a stop codon: 882 883 >>> coding_dna.translate(table=2) 884 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 885 >>> coding_dna.translate(table=2, to_stop=True) 886 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 887 888 In fact, GTG is an alternative start codon under NCBI table 2, meaning 889 this sequence could be a complete CDS: 890 891 >>> coding_dna.translate(table=2, cds=True) 892 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 893 894 It isn't a valid CDS under NCBI table 1, due to both the start codon and 895 also the in frame stop codons: 896 897 >>> coding_dna.translate(table=1, cds=True) 898 Traceback (most recent call last): 899 ... 900 TranslationError: First codon 'GTG' is not a start codon 901 902 If the sequence has no in-frame stop codon, then the to_stop argument 903 has no effect: 904 905 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 906 >>> coding_dna2.translate() 907 Seq('LAIVMGR', ExtendedIUPACProtein()) 908 >>> coding_dna2.translate(to_stop=True) 909 Seq('LAIVMGR', ExtendedIUPACProtein()) 910 911 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 912 or a stop codon. These are translated as "X". Any invalid codon 913 (e.g. "TA?" or "T-A") will throw a TranslationError. 914 915 NOTE - Does NOT support gapped sequences. 916 917 NOTE - This does NOT behave like the python string's translate 918 method. For that use str(my_seq).translate(...) instead. 919 """ 920 if isinstance(table, str) and len(table)==256: 921 raise ValueError("The Seq object translate method DOES NOT take " 922 + "a 256 character string mapping table like " 923 + "the python string object's translate method. " 924 + "Use str(my_seq).translate(...) instead.") 925 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 926 Alphabet.ProteinAlphabet): 927 raise ValueError("Proteins cannot be translated!") 928 try: 929 table_id = int(table) 930 except ValueError: 931 #Assume its a table name 932 if self.alphabet==IUPAC.unambiguous_dna: 933 #Will use standard IUPAC protein alphabet, no need for X 934 codon_table = CodonTable.unambiguous_dna_by_name[table] 935 elif self.alphabet==IUPAC.unambiguous_rna: 936 #Will use standard IUPAC protein alphabet, no need for X 937 codon_table = CodonTable.unambiguous_rna_by_name[table] 938 else: 939 #This will use the extended IUPAC protein alphabet with X etc. 940 #The same table can be used for RNA or DNA (we use this for 941 #translating strings). 942 codon_table = CodonTable.ambiguous_generic_by_name[table] 943 except (AttributeError, TypeError): 944 #Assume its a CodonTable object 945 if isinstance(table, CodonTable.CodonTable): 946 codon_table = table 947 else: 948 raise ValueError('Bad table argument') 949 else: 950 #Assume its a table ID 951 if self.alphabet==IUPAC.unambiguous_dna: 952 #Will use standard IUPAC protein alphabet, no need for X 953 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 954 elif self.alphabet==IUPAC.unambiguous_rna: 955 #Will use standard IUPAC protein alphabet, no need for X 956 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 957 else: 958 #This will use the extended IUPAC protein alphabet with X etc. 959 #The same table can be used for RNA or DNA (we use this for 960 #translating strings). 961 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 962 protein = _translate_str(str(self), codon_table, 963 stop_symbol, to_stop, cds) 964 if stop_symbol in protein: 965 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 966 stop_symbol = stop_symbol) 967 else: 968 alphabet = codon_table.protein_alphabet 969 return Seq(protein, alphabet)
970
971 - def ungap(self, gap=None):
972 """Return a copy of the sequence without the gap character(s). 973 974 The gap character can be specified in two ways - either as an explicit 975 argument, or via the sequence's alphabet. For example: 976 977 >>> from Bio.Seq import Seq 978 >>> from Bio.Alphabet import generic_dna 979 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 980 >>> my_dna 981 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 982 >>> my_dna.ungap("-") 983 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 984 985 If the gap character is not given as an argument, it will be taken from 986 the sequence's alphabet (if defined). Notice that the returned sequence's 987 alphabet is adjusted since it no longer requires a gapped alphabet: 988 989 >>> from Bio.Seq import Seq 990 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 991 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 992 >>> my_pro 993 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 994 >>> my_pro.ungap() 995 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 996 997 Or, with a simpler gapped DNA example: 998 999 >>> from Bio.Seq import Seq 1000 >>> from Bio.Alphabet import IUPAC, Gapped 1001 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1002 >>> my_seq 1003 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1004 >>> my_seq.ungap() 1005 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1006 1007 As long as it is consistent with the alphabet, although it is redundant, 1008 you can still supply the gap character as an argument to this method: 1009 1010 >>> my_seq 1011 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1012 >>> my_seq.ungap("=") 1013 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1014 1015 However, if the gap character given as the argument disagrees with that 1016 declared in the alphabet, an exception is raised: 1017 1018 >>> my_seq 1019 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1020 >>> my_seq.ungap("-") 1021 Traceback (most recent call last): 1022 ... 1023 ValueError: Gap '-' does not match '=' from alphabet 1024 1025 Finally, if a gap character is not supplied, and the alphabet does not 1026 define one, an exception is raised: 1027 1028 >>> from Bio.Seq import Seq 1029 >>> from Bio.Alphabet import generic_dna 1030 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1031 >>> my_dna 1032 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1033 >>> my_dna.ungap() 1034 Traceback (most recent call last): 1035 ... 1036 ValueError: Gap character not given and not defined in alphabet 1037 1038 """ 1039 if hasattr(self.alphabet, "gap_char"): 1040 if not gap: 1041 gap = self.alphabet.gap_char 1042 elif gap != self.alphabet.gap_char: 1043 raise ValueError("Gap %s does not match %s from alphabet" 1044 % (repr(gap), repr(self.alphabet.gap_char))) 1045 alpha = Alphabet._ungap(self.alphabet) 1046 elif not gap: 1047 raise ValueError("Gap character not given and not defined in alphabet") 1048 else: 1049 alpha = self.alphabet # modify! 1050 if len(gap)!=1 or not isinstance(gap, str): 1051 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1052 return Seq(str(self).replace(gap, ""), alpha)
1053 1054
1055 -class UnknownSeq(Seq):
1056 """A read-only sequence object of known length but unknown contents. 1057 1058 If you have an unknown sequence, you can represent this with a normal 1059 Seq object, for example: 1060 1061 >>> my_seq = Seq("N"*5) 1062 >>> my_seq 1063 Seq('NNNNN', Alphabet()) 1064 >>> len(my_seq) 1065 5 1066 >>> print(my_seq) 1067 NNNNN 1068 1069 However, this is rather wasteful of memory (especially for large 1070 sequences), which is where this class is most usefull: 1071 1072 >>> unk_five = UnknownSeq(5) 1073 >>> unk_five 1074 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1075 >>> len(unk_five) 1076 5 1077 >>> print(unk_five) 1078 ????? 1079 1080 You can add unknown sequence together, provided their alphabets and 1081 characters are compatible, and get another memory saving UnknownSeq: 1082 1083 >>> unk_four = UnknownSeq(4) 1084 >>> unk_four 1085 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1086 >>> unk_four + unk_five 1087 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1088 1089 If the alphabet or characters don't match up, the addition gives an 1090 ordinary Seq object: 1091 1092 >>> unk_nnnn = UnknownSeq(4, character = "N") 1093 >>> unk_nnnn 1094 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1095 >>> unk_nnnn + unk_four 1096 Seq('NNNN????', Alphabet()) 1097 1098 Combining with a real Seq gives a new Seq object: 1099 1100 >>> known_seq = Seq("ACGT") 1101 >>> unk_four + known_seq 1102 Seq('????ACGT', Alphabet()) 1103 >>> known_seq + unk_four 1104 Seq('ACGT????', Alphabet()) 1105 """
1106 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1107 """Create a new UnknownSeq object. 1108 1109 If character is omitted, it is determined from the alphabet, "N" for 1110 nucleotides, "X" for proteins, and "?" otherwise. 1111 """ 1112 self._length = int(length) 1113 if self._length < 0: 1114 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1115 raise ValueError("Length must not be negative.") 1116 self.alphabet = alphabet 1117 if character: 1118 if len(character) != 1: 1119 raise ValueError("character argument should be a single letter string.") 1120 self._character = character 1121 else: 1122 base = Alphabet._get_base_alphabet(alphabet) 1123 #TODO? Check the case of the letters in the alphabet? 1124 #We may have to use "n" instead of "N" etc. 1125 if isinstance(base, Alphabet.NucleotideAlphabet): 1126 self._character = "N" 1127 elif isinstance(base, Alphabet.ProteinAlphabet): 1128 self._character = "X" 1129 else: 1130 self._character = "?"
1131
1132 - def __len__(self):
1133 """Returns the stated length of the unknown sequence.""" 1134 return self._length
1135
1136 - def __str__(self):
1137 """Returns the unknown sequence as full string of the given length.""" 1138 return self._character * self._length
1139
1140 - def __repr__(self):
1141 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1142 % (self._length, repr(self.alphabet), repr(self._character))
1143
1144 - def __add__(self, other):
1145 """Add another sequence or string to this sequence. 1146 1147 Adding two UnknownSeq objects returns another UnknownSeq object 1148 provided the character is the same and the alphabets are compatible. 1149 1150 >>> from Bio.Seq import UnknownSeq 1151 >>> from Bio.Alphabet import generic_protein 1152 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1153 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1154 1155 If the characters differ, an UnknownSeq object cannot be used, so a 1156 Seq object is returned: 1157 1158 >>> from Bio.Seq import UnknownSeq 1159 >>> from Bio.Alphabet import generic_protein 1160 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1161 ... character="x") 1162 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1163 1164 If adding a string to an UnknownSeq, a new Seq is returned with the 1165 same alphabet: 1166 1167 >>> from Bio.Seq import UnknownSeq 1168 >>> from Bio.Alphabet import generic_protein 1169 >>> UnknownSeq(5, generic_protein) + "LV" 1170 Seq('XXXXXLV', ProteinAlphabet()) 1171 """ 1172 if isinstance(other, UnknownSeq) \ 1173 and other._character == self._character: 1174 #TODO - Check the alphabets match 1175 return UnknownSeq(len(self)+len(other), 1176 self.alphabet, self._character) 1177 #Offload to the base class... 1178 return Seq(str(self), self.alphabet) + other
1179
1180 - def __radd__(self, other):
1181 #If other is an UnknownSeq, then __add__ would be called. 1182 #Offload to the base class... 1183 return other + Seq(str(self), self.alphabet)
1184
1185 - def __getitem__(self, index):
1186 """Get a subsequence from the UnknownSeq object. 1187 1188 >>> unk = UnknownSeq(8, character="N") 1189 >>> print(unk[:]) 1190 NNNNNNNN 1191 >>> print(unk[5:3]) 1192 <BLANKLINE> 1193 >>> print(unk[1:-1]) 1194 NNNNNN 1195 >>> print(unk[1:-1:2]) 1196 NNN 1197 """ 1198 if isinstance(index, int): 1199 #TODO - Check the bounds without wasting memory 1200 return str(self)[index] 1201 old_length = self._length 1202 step = index.step 1203 if step is None or step == 1: 1204 #This calculates the length you'd get from ("N"*old_length)[index] 1205 start = index.start 1206 end = index.stop 1207 if start is None: 1208 start = 0 1209 elif start < 0: 1210 start = max(0, old_length + start) 1211 elif start > old_length: 1212 start = old_length 1213 if end is None: 1214 end = old_length 1215 elif end < 0: 1216 end = max(0, old_length + end) 1217 elif end > old_length: 1218 end = old_length 1219 new_length = max(0, end-start) 1220 elif step == 0: 1221 raise ValueError("slice step cannot be zero") 1222 else: 1223 #TODO - handle step efficiently 1224 new_length = len(("X"*old_length)[index]) 1225 #assert new_length == len(("X"*old_length)[index]), \ 1226 # (index, start, end, step, old_length, 1227 # new_length, len(("X"*old_length)[index])) 1228 return UnknownSeq(new_length, self.alphabet, self._character)
1229
1230 - def count(self, sub, start=0, end=sys.maxsize):
1231 """Non-overlapping count method, like that of a python string. 1232 1233 This behaves like the python string (and Seq object) method of the 1234 same name, which does a non-overlapping count! 1235 1236 Returns an integer, the number of occurrences of substring 1237 argument sub in the (sub)sequence given by [start:end]. 1238 Optional arguments start and end are interpreted as in slice 1239 notation. 1240 1241 Arguments: 1242 - sub - a string or another Seq object to look for 1243 - start - optional integer, slice start 1244 - end - optional integer, slice end 1245 1246 >>> "NNNN".count("N") 1247 4 1248 >>> Seq("NNNN").count("N") 1249 4 1250 >>> UnknownSeq(4, character="N").count("N") 1251 4 1252 >>> UnknownSeq(4, character="N").count("A") 1253 0 1254 >>> UnknownSeq(4, character="N").count("AA") 1255 0 1256 1257 HOWEVER, please note because that python strings and Seq objects (and 1258 MutableSeq objects) do a non-overlapping search, this may not give 1259 the answer you expect: 1260 1261 >>> UnknownSeq(4, character="N").count("NN") 1262 2 1263 >>> UnknownSeq(4, character="N").count("NNN") 1264 1 1265 """ 1266 sub_str = self._get_seq_str_and_check_alphabet(sub) 1267 if len(sub_str) == 1: 1268 if str(sub_str) == self._character: 1269 if start==0 and end >= self._length: 1270 return self._length 1271 else: 1272 #This could be done more cleverly... 1273 return str(self).count(sub_str, start, end) 1274 else: 1275 return 0 1276 else: 1277 if set(sub_str) == set(self._character): 1278 if start==0 and end >= self._length: 1279 return self._length // len(sub_str) 1280 else: 1281 #This could be done more cleverly... 1282 return str(self).count(sub_str, start, end) 1283 else: 1284 return 0
1285
1286 - def complement(self):
1287 """The complement of an unknown nucleotide equals itself. 1288 1289 >>> my_nuc = UnknownSeq(8) 1290 >>> my_nuc 1291 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1292 >>> print(my_nuc) 1293 ???????? 1294 >>> my_nuc.complement() 1295 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1296 >>> print(my_nuc.complement()) 1297 ???????? 1298 """ 1299 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1300 Alphabet.ProteinAlphabet): 1301 raise ValueError("Proteins do not have complements!") 1302 return self
1303
1304 - def reverse_complement(self):
1305 """The reverse complement of an unknown nucleotide equals itself. 1306 1307 >>> my_nuc = UnknownSeq(10) 1308 >>> my_nuc 1309 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1310 >>> print(my_nuc) 1311 ?????????? 1312 >>> my_nuc.reverse_complement() 1313 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1314 >>> print(my_nuc.reverse_complement()) 1315 ?????????? 1316 """ 1317 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1318 Alphabet.ProteinAlphabet): 1319 raise ValueError("Proteins do not have complements!") 1320 return self
1321
1322 - def transcribe(self):
1323 """Returns unknown RNA sequence from an unknown DNA sequence. 1324 1325 >>> my_dna = UnknownSeq(10, character="N") 1326 >>> my_dna 1327 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1328 >>> print(my_dna) 1329 NNNNNNNNNN 1330 >>> my_rna = my_dna.transcribe() 1331 >>> my_rna 1332 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1333 >>> print(my_rna) 1334 NNNNNNNNNN 1335 """ 1336 #Offload the alphabet stuff 1337 s = Seq(self._character, self.alphabet).transcribe() 1338 return UnknownSeq(self._length, s.alphabet, self._character)
1339
1340 - def back_transcribe(self):
1341 """Returns unknown DNA sequence from an unknown RNA sequence. 1342 1343 >>> my_rna = UnknownSeq(20, character="N") 1344 >>> my_rna 1345 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1346 >>> print(my_rna) 1347 NNNNNNNNNNNNNNNNNNNN 1348 >>> my_dna = my_rna.back_transcribe() 1349 >>> my_dna 1350 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1351 >>> print(my_dna) 1352 NNNNNNNNNNNNNNNNNNNN 1353 """ 1354 #Offload the alphabet stuff 1355 s = Seq(self._character, self.alphabet).back_transcribe() 1356 return UnknownSeq(self._length, s.alphabet, self._character)
1357
1358 - def upper(self):
1359 """Returns an upper case copy of the sequence. 1360 1361 >>> from Bio.Alphabet import generic_dna 1362 >>> from Bio.Seq import UnknownSeq 1363 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1364 >>> my_seq 1365 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1366 >>> print(my_seq) 1367 nnnnnnnnnnnnnnnnnnnn 1368 >>> my_seq.upper() 1369 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1370 >>> print(my_seq.upper()) 1371 NNNNNNNNNNNNNNNNNNNN 1372 1373 This will adjust the alphabet if required. See also the lower method. 1374 """ 1375 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1376
1377 - def lower(self):
1378 """Returns a lower case copy of the sequence. 1379 1380 This will adjust the alphabet if required: 1381 1382 >>> from Bio.Alphabet import IUPAC 1383 >>> from Bio.Seq import UnknownSeq 1384 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1385 >>> my_seq 1386 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1387 >>> print(my_seq) 1388 XXXXXXXXXXXXXXXXXXXX 1389 >>> my_seq.lower() 1390 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1391 >>> print(my_seq.lower()) 1392 xxxxxxxxxxxxxxxxxxxx 1393 1394 See also the upper method. 1395 """ 1396 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1397
1398 - def translate(self, **kwargs):
1399 """Translate an unknown nucleotide sequence into an unknown protein. 1400 1401 e.g. 1402 1403 >>> my_seq = UnknownSeq(9, character="N") 1404 >>> print(my_seq) 1405 NNNNNNNNN 1406 >>> my_protein = my_seq.translate() 1407 >>> my_protein 1408 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1409 >>> print(my_protein) 1410 XXX 1411 1412 In comparison, using a normal Seq object: 1413 1414 >>> my_seq = Seq("NNNNNNNNN") 1415 >>> print(my_seq) 1416 NNNNNNNNN 1417 >>> my_protein = my_seq.translate() 1418 >>> my_protein 1419 Seq('XXX', ExtendedIUPACProtein()) 1420 >>> print(my_protein) 1421 XXX 1422 1423 """ 1424 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1425 Alphabet.ProteinAlphabet): 1426 raise ValueError("Proteins cannot be translated!") 1427 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1428
1429 - def ungap(self, gap=None):
1430 """Return a copy of the sequence without the gap character(s). 1431 1432 The gap character can be specified in two ways - either as an explicit 1433 argument, or via the sequence's alphabet. For example: 1434 1435 >>> from Bio.Seq import UnknownSeq 1436 >>> from Bio.Alphabet import Gapped, generic_dna 1437 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1438 >>> my_dna 1439 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1440 >>> my_dna.ungap() 1441 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1442 >>> my_dna.ungap("-") 1443 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1444 1445 If the UnknownSeq is using the gap character, then an empty Seq is 1446 returned: 1447 1448 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1449 >>> my_gap 1450 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1451 >>> my_gap.ungap() 1452 Seq('', DNAAlphabet()) 1453 >>> my_gap.ungap("-") 1454 Seq('', DNAAlphabet()) 1455 1456 Notice that the returned sequence's alphabet is adjusted to remove any 1457 explicit gap character declaration. 1458 """ 1459 #Offload the alphabet stuff 1460 s = Seq(self._character, self.alphabet).ungap() 1461 if s: 1462 return UnknownSeq(self._length, s.alphabet, self._character) 1463 else: 1464 return Seq("", s.alphabet)
1465 1466
1467 -class MutableSeq(object):
1468 """An editable sequence object (with an alphabet). 1469 1470 Unlike normal python strings and our basic sequence object (the Seq class) 1471 which are immuatable, the MutableSeq lets you edit the sequence in place. 1472 However, this means you cannot use a MutableSeq object as a dictionary key. 1473 1474 >>> from Bio.Seq import MutableSeq 1475 >>> from Bio.Alphabet import generic_dna 1476 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1477 >>> my_seq 1478 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1479 >>> my_seq[5] 1480 'T' 1481 >>> my_seq[5] = "A" 1482 >>> my_seq 1483 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1484 >>> my_seq[5] 1485 'A' 1486 >>> my_seq[5:8] = "NNN" 1487 >>> my_seq 1488 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1489 >>> len(my_seq) 1490 11 1491 1492 Note that the MutableSeq object does not support as many string-like 1493 or biological methods as the Seq object. 1494 """
1495 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1496 if sys.version_info[0] == 3: 1497 self.array_indicator = "u" 1498 else: 1499 self.array_indicator = "c" 1500 if isinstance(data, str): # TODO - What about unicode? 1501 self.data = array.array(self.array_indicator, data) 1502 else: 1503 self.data = data # assumes the input is an array 1504 self.alphabet = alphabet
1505
1506 - def __repr__(self):
1507 """Returns a (truncated) representation of the sequence for debugging.""" 1508 if len(self) > 60: 1509 #Shows the last three letters as it is often useful to see if there 1510 #is a stop codon at the end of a sequence. 1511 #Note total length is 54+3+3=60 1512 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1513 str(self[:54]), str(self[-3:]), 1514 repr(self.alphabet)) 1515 else: 1516 return "%s('%s', %s)" % (self.__class__.__name__, 1517 str(self), 1518 repr(self.alphabet))
1519
1520 - def __str__(self):
1521 """Returns the full sequence as a python string. 1522 1523 Note that Biopython 1.44 and earlier would give a truncated 1524 version of repr(my_seq) for str(my_seq). If you are writing code 1525 which needs to be backwards compatible with old Biopython, you 1526 should continue to use my_seq.tostring() rather than str(my_seq). 1527 """ 1528 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1529 return "".join(self.data)
1530
1531 - def __cmp__(self, other):
1532 """Compare the sequence to another sequence or a string (README). 1533 1534 Currently if compared to another sequence the alphabets must be 1535 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1536 an exception. Otherwise only the sequence itself is compared, not the 1537 precise alphabet. 1538 1539 A future release of Biopython will change this (and the Seq object etc) 1540 to use simple string comparison. The plan is that comparing sequences 1541 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1542 but not an exception. 1543 1544 During this transition period, please just do explicit comparisons: 1545 1546 >>> seq1 = MutableSeq("ACGT") 1547 >>> seq2 = MutableSeq("ACGT") 1548 >>> id(seq1) == id(seq2) 1549 False 1550 >>> str(seq1) == str(seq2) 1551 True 1552 1553 This method indirectly supports ==, < , etc. 1554 """ 1555 if hasattr(other, "alphabet"): 1556 #other should be a Seq or a MutableSeq 1557 warnings.warn("In future comparing incompatible alphabets will " 1558 "only trigger a warning (not an exception). In " 1559 "the interim please use id(seq1)==id(seq2) or " 1560 "str(seq1)==str(seq2) to make your code explicit " 1561 "and to avoid this warning.", FutureWarning) 1562 if not Alphabet._check_type_compatible([self.alphabet, 1563 other.alphabet]): 1564 raise TypeError("Incompatable alphabets %s and %s" 1565 % (repr(self.alphabet), repr(other.alphabet))) 1566 #They should be the same sequence type (or one of them is generic) 1567 if isinstance(other, MutableSeq): 1568 #See test_GAQueens.py for an historic usage of a non-string 1569 #alphabet! Comparing the arrays supports this. 1570 return cmp(self.data, other.data) 1571 else: 1572 return cmp(str(self), str(other)) 1573 elif isinstance(other, basestring): 1574 return cmp(str(self), other) 1575 else: 1576 raise TypeError
1577
1578 - def __len__(self):
1579 return len(self.data)
1580
1581 - def __getitem__(self, index):
1582 #Note since Python 2.0, __getslice__ is deprecated 1583 #and __getitem__ is used instead. 1584 #See http://docs.python.org/ref/sequence-methods.html 1585 if isinstance(index, int): 1586 #Return a single letter as a string 1587 return self.data[index] 1588 else: 1589 #Return the (sub)sequence as another Seq object 1590 return MutableSeq(self.data[index], self.alphabet)
1591
1592 - def __setitem__(self, index, value):
1593 #Note since Python 2.0, __setslice__ is deprecated 1594 #and __setitem__ is used instead. 1595 #See http://docs.python.org/ref/sequence-methods.html 1596 if isinstance(index, int): 1597 #Replacing a single letter with a new string 1598 self.data[index] = value 1599 else: 1600 #Replacing a sub-sequence 1601 if isinstance(value, MutableSeq): 1602 self.data[index] = value.data 1603 elif isinstance(value, type(self.data)): 1604 self.data[index] = value 1605 else: 1606 self.data[index] = array.array(self.array_indicator, 1607 str(value))
1608
1609 - def __delitem__(self, index):
1610 #Note since Python 2.0, __delslice__ is deprecated 1611 #and __delitem__ is used instead. 1612 #See http://docs.python.org/ref/sequence-methods.html 1613 1614 #Could be deleting a single letter, or a slice 1615 del self.data[index]
1616
1617 - def __add__(self, other):
1618 """Add another sequence or string to this sequence. 1619 1620 Returns a new MutableSeq object.""" 1621 if hasattr(other, "alphabet"): 1622 #other should be a Seq or a MutableSeq 1623 if not Alphabet._check_type_compatible([self.alphabet, 1624 other.alphabet]): 1625 raise TypeError("Incompatable alphabets %s and %s" 1626 % (repr(self.alphabet), repr(other.alphabet))) 1627 #They should be the same sequence type (or one of them is generic) 1628 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1629 if isinstance(other, MutableSeq): 1630 #See test_GAQueens.py for an historic usage of a non-string 1631 #alphabet! Adding the arrays should support this. 1632 return self.__class__(self.data + other.data, a) 1633 else: 1634 return self.__class__(str(self) + str(other), a) 1635 elif isinstance(other, basestring): 1636 #other is a plain string - use the current alphabet 1637 return self.__class__(str(self) + str(other), self.alphabet) 1638 else: 1639 raise TypeError
1640
1641 - def __radd__(self, other):
1642 if hasattr(other, "alphabet"): 1643 #other should be a Seq or a MutableSeq 1644 if not Alphabet._check_type_compatible([self.alphabet, 1645 other.alphabet]): 1646 raise TypeError("Incompatable alphabets %s and %s" 1647 % (repr(self.alphabet), repr(other.alphabet))) 1648 #They should be the same sequence type (or one of them is generic) 1649 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1650 if isinstance(other, MutableSeq): 1651 #See test_GAQueens.py for an historic usage of a non-string 1652 #alphabet! Adding the arrays should support this. 1653 return self.__class__(other.data + self.data, a) 1654 else: 1655 return self.__class__(str(other) + str(self), a) 1656 elif isinstance(other, basestring): 1657 #other is a plain string - use the current alphabet 1658 return self.__class__(str(other) + str(self), self.alphabet) 1659 else: 1660 raise TypeError
1661
1662 - def append(self, c):
1663 self.data.append(c)
1664
1665 - def insert(self, i, c):
1666 self.data.insert(i, c)
1667
1668 - def pop(self, i = (-1)):
1669 c = self.data[i] 1670 del self.data[i] 1671 return c
1672
1673 - def remove(self, item):
1674 for i in range(len(self.data)): 1675 if self.data[i] == item: 1676 del self.data[i] 1677 return 1678 raise ValueError("MutableSeq.remove(x): x not in list")
1679
1680 - def count(self, sub, start=0, end=sys.maxsize):
1681 """Non-overlapping count method, like that of a python string. 1682 1683 This behaves like the python string method of the same name, 1684 which does a non-overlapping count! 1685 1686 Returns an integer, the number of occurrences of substring 1687 argument sub in the (sub)sequence given by [start:end]. 1688 Optional arguments start and end are interpreted as in slice 1689 notation. 1690 1691 Arguments: 1692 - sub - a string or another Seq object to look for 1693 - start - optional integer, slice start 1694 - end - optional integer, slice end 1695 1696 e.g. 1697 1698 >>> from Bio.Seq import MutableSeq 1699 >>> my_mseq = MutableSeq("AAAATGA") 1700 >>> print(my_mseq.count("A")) 1701 5 1702 >>> print(my_mseq.count("ATG")) 1703 1 1704 >>> print(my_mseq.count(Seq("AT"))) 1705 1 1706 >>> print(my_mseq.count("AT", 2, -1)) 1707 1 1708 1709 HOWEVER, please note because that python strings, Seq objects and 1710 MutableSeq objects do a non-overlapping search, this may not give 1711 the answer you expect: 1712 1713 >>> "AAAA".count("AA") 1714 2 1715 >>> print(MutableSeq("AAAA").count("AA")) 1716 2 1717 1718 An overlapping search would give the answer as three! 1719 """ 1720 try: 1721 #TODO - Should we check the alphabet? 1722 search = str(sub) 1723 except AttributeError: 1724 search = sub 1725 1726 if not isinstance(search, basestring): 1727 raise TypeError("expected a string, Seq or MutableSeq") 1728 1729 if len(search) == 1: 1730 #Try and be efficient and work directly from the array. 1731 count = 0 1732 for c in self.data[start:end]: 1733 if c == search: 1734 count += 1 1735 return count 1736 else: 1737 #TODO - Can we do this more efficiently? 1738 return str(self).count(search, start, end)
1739
1740 - def index(self, item):
1741 for i in range(len(self.data)): 1742 if self.data[i] == item: 1743 return i 1744 raise ValueError("MutableSeq.index(x): x not in list")
1745
1746 - def reverse(self):
1747 """Modify the mutable sequence to reverse itself. 1748 1749 No return value. 1750 """ 1751 self.data.reverse()
1752
1753 - def complement(self):
1754 """Modify the mutable sequence to take on its complement. 1755 1756 Trying to complement a protein sequence raises an exception. 1757 1758 No return value. 1759 """ 1760 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1761 Alphabet.ProteinAlphabet): 1762 raise ValueError("Proteins do not have complements!") 1763 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1764 d = ambiguous_dna_complement 1765 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1766 d = ambiguous_rna_complement 1767 elif 'U' in self.data and 'T' in self.data: 1768 #TODO - Handle this cleanly? 1769 raise ValueError("Mixed RNA/DNA found") 1770 elif 'U' in self.data: 1771 d = ambiguous_rna_complement 1772 else: 1773 d = ambiguous_dna_complement 1774 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1775 d.update(c) 1776 self.data = [d[c] for c in self.data] 1777 self.data = array.array(self.array_indicator, self.data)
1778
1779 - def reverse_complement(self):
1780 """Modify the mutable sequence to take on its reverse complement. 1781 1782 Trying to reverse complement a protein sequence raises an exception. 1783 1784 No return value. 1785 """ 1786 self.complement() 1787 self.data.reverse()
1788 1789 ## Sorting a sequence makes no sense. 1790 # def sort(self, *args): self.data.sort(*args) 1791
1792 - def extend(self, other):
1793 if isinstance(other, MutableSeq): 1794 for c in other.data: 1795 self.data.append(c) 1796 else: 1797 for c in other: 1798 self.data.append(c)
1799
1800 - def tostring(self):
1801 """Returns the full sequence as a python string (DEPRECATED). 1802 1803 You are now encouraged to use str(my_seq) instead of my_seq.tostring() 1804 as this method is officially deprecated. 1805 1806 Because str(my_seq) will give you the full sequence as a python string, 1807 there is often no need to make an explicit conversion. For example, 1808 1809 print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1810 1811 On Biopython 1.44 or older you would have to have done this: 1812 1813 print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1814 """ 1815 from Bio import BiopythonDeprecationWarning 1816 warnings.warn("This method is obsolete; please use str(my_seq) " 1817 "instead of my_seq.tostring().", 1818 BiopythonDeprecationWarning) 1819 return "".join(self.data)
1820
1821 - def toseq(self):
1822 """Returns the full sequence as a new immutable Seq object. 1823 1824 >>> from Bio.Seq import Seq 1825 >>> from Bio.Alphabet import IUPAC 1826 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1827 ... IUPAC.protein) 1828 >>> my_mseq 1829 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1830 >>> my_mseq.toseq() 1831 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1832 1833 Note that the alphabet is preserved. 1834 """ 1835 return Seq("".join(self.data), self.alphabet)
1836 1837 1838 # The transcribe, backward_transcribe, and translate functions are 1839 # user-friendly versions of the corresponding functions in Bio.Transcribe 1840 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1841
1842 -def transcribe(dna):
1843 """Transcribes a DNA sequence into RNA. 1844 1845 If given a string, returns a new string object. 1846 1847 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1848 1849 Trying to transcribe a protein or RNA sequence raises an exception. 1850 1851 e.g. 1852 1853 >>> transcribe("ACTGN") 1854 'ACUGN' 1855 """ 1856 if isinstance(dna, Seq): 1857 return dna.transcribe() 1858 elif isinstance(dna, MutableSeq): 1859 return dna.toseq().transcribe() 1860 else: 1861 return dna.replace('T', 'U').replace('t', 'u')
1862 1863
1864 -def back_transcribe(rna):
1865 """Back-transcribes an RNA sequence into DNA. 1866 1867 If given a string, returns a new string object. 1868 1869 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1870 1871 Trying to transcribe a protein or DNA sequence raises an exception. 1872 1873 e.g. 1874 1875 >>> back_transcribe("ACUGN") 1876 'ACTGN' 1877 """ 1878 if isinstance(rna, Seq): 1879 return rna.back_transcribe() 1880 elif isinstance(rna, MutableSeq): 1881 return rna.toseq().back_transcribe() 1882 else: 1883 return rna.replace('U', 'T').replace('u', 't')
1884 1885
1886 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1887 cds=False, pos_stop="X"):
1888 """Helper function to translate a nucleotide string (PRIVATE). 1889 1890 Arguments: 1891 - sequence - a string 1892 - table - a CodonTable object (NOT a table name or id number) 1893 - stop_symbol - a single character string, what to use for terminators. 1894 - to_stop - boolean, should translation terminate at the first 1895 in frame stop codon? If there is no in-frame stop codon 1896 then translation continues to the end. 1897 - pos_stop - a single character string for a possible stop codon 1898 (e.g. TAN or NNN) 1899 - cds - Boolean, indicates this is a complete CDS. If True, this 1900 checks the sequence starts with a valid alternative start 1901 codon (which will be translated as methionine, M), that the 1902 sequence length is a multiple of three, and that there is a 1903 single in frame stop codon at the end (this will be excluded 1904 from the protein sequence, regardless of the to_stop option). 1905 If these tests fail, an exception is raised. 1906 1907 Returns a string. 1908 1909 e.g. 1910 1911 >>> from Bio.Data import CodonTable 1912 >>> table = CodonTable.ambiguous_dna_by_id[1] 1913 >>> _translate_str("AAA", table) 1914 'K' 1915 >>> _translate_str("TAR", table) 1916 '*' 1917 >>> _translate_str("TAN", table) 1918 'X' 1919 >>> _translate_str("TAN", table, pos_stop="@") 1920 '@' 1921 >>> _translate_str("TA?", table) 1922 Traceback (most recent call last): 1923 ... 1924 TranslationError: Codon 'TA?' is invalid 1925 1926 In a change to older verions of Biopython, partial codons are now 1927 always regarded as an error (previously only checked if cds=True) 1928 and will trigger a warning (likely to become an exception in a 1929 future release). 1930 1931 If cds=True, the start and stop codons are checked, and the start 1932 codon will be translated at methionine. The sequence must be an 1933 while number of codons. 1934 1935 >>> _translate_str("ATGCCCTAG", table, cds=True) 1936 'MP' 1937 >>> _translate_str("AAACCCTAG", table, cds=True) 1938 Traceback (most recent call last): 1939 ... 1940 TranslationError: First codon 'AAA' is not a start codon 1941 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1942 Traceback (most recent call last): 1943 ... 1944 TranslationError: Extra in frame stop codon found. 1945 """ 1946 sequence = sequence.upper() 1947 amino_acids = [] 1948 forward_table = table.forward_table 1949 stop_codons = table.stop_codons 1950 if table.nucleotide_alphabet.letters is not None: 1951 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1952 else: 1953 #Assume the worst case, ambiguous DNA or RNA: 1954 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 1955 IUPAC.ambiguous_rna.letters.upper()) 1956 n = len(sequence) 1957 if cds: 1958 if str(sequence[:3]).upper() not in table.start_codons: 1959 raise CodonTable.TranslationError( 1960 "First codon '%s' is not a start codon" % sequence[:3]) 1961 if n % 3 != 0: 1962 raise CodonTable.TranslationError( 1963 "Sequence length %i is not a multiple of three" % n) 1964 if str(sequence[-3:]).upper() not in stop_codons: 1965 raise CodonTable.TranslationError( 1966 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1967 #Don't translate the stop symbol, and manually translate the M 1968 sequence = sequence[3:-3] 1969 n -= 6 1970 amino_acids = ["M"] 1971 elif n % 3 != 0: 1972 from Bio import BiopythonWarning 1973 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 1974 "Explicitly trim the sequence or add trailing N before " 1975 "translation. This may become an error in future.", 1976 BiopythonWarning) 1977 for i in range(0, n - n%3, 3): 1978 codon = sequence[i:i+3] 1979 try: 1980 amino_acids.append(forward_table[codon]) 1981 except (KeyError, CodonTable.TranslationError): 1982 #Todo? Treat "---" as a special case (gapped translation) 1983 if codon in table.stop_codons: 1984 if cds: 1985 raise CodonTable.TranslationError( 1986 "Extra in frame stop codon found.") 1987 if to_stop: 1988 break 1989 amino_acids.append(stop_symbol) 1990 elif valid_letters.issuperset(set(codon)): 1991 #Possible stop codon (e.g. NNN or TAN) 1992 amino_acids.append(pos_stop) 1993 else: 1994 raise CodonTable.TranslationError( 1995 "Codon '%s' is invalid" % codon) 1996 return "".join(amino_acids)
1997 1998
1999 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2000 cds=False):
2001 """Translate a nucleotide sequence into amino acids. 2002 2003 If given a string, returns a new string object. Given a Seq or 2004 MutableSeq, returns a Seq object with a protein alphabet. 2005 2006 Arguments: 2007 - table - Which codon table to use? This can be either a name (string), 2008 an NCBI identifier (integer), or a CodonTable object (useful 2009 for non-standard genetic codes). Defaults to the "Standard" 2010 table. 2011 - stop_symbol - Single character string, what to use for any 2012 terminators, defaults to the asterisk, "*". 2013 - to_stop - Boolean, defaults to False meaning do a full 2014 translation continuing on past any stop codons 2015 (translated as the specified stop_symbol). If 2016 True, translation is terminated at the first in 2017 frame stop codon (and the stop_symbol is not 2018 appended to the returned protein sequence). 2019 - cds - Boolean, indicates this is a complete CDS. If True, this 2020 checks the sequence starts with a valid alternative start 2021 codon (which will be translated as methionine, M), that the 2022 sequence length is a multiple of three, and that there is a 2023 single in frame stop codon at the end (this will be excluded 2024 from the protein sequence, regardless of the to_stop option). 2025 If these tests fail, an exception is raised. 2026 2027 A simple string example using the default (standard) genetic code: 2028 2029 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2030 >>> translate(coding_dna) 2031 'VAIVMGR*KGAR*' 2032 >>> translate(coding_dna, stop_symbol="@") 2033 'VAIVMGR@KGAR@' 2034 >>> translate(coding_dna, to_stop=True) 2035 'VAIVMGR' 2036 2037 Now using NCBI table 2, where TGA is not a stop codon: 2038 2039 >>> translate(coding_dna, table=2) 2040 'VAIVMGRWKGAR*' 2041 >>> translate(coding_dna, table=2, to_stop=True) 2042 'VAIVMGRWKGAR' 2043 2044 In fact this example uses an alternative start codon valid under NCBI table 2, 2045 GTG, which means this example is a complete valid CDS which when translated 2046 should really start with methionine (not valine): 2047 2048 >>> translate(coding_dna, table=2, cds=True) 2049 'MAIVMGRWKGAR' 2050 2051 Note that if the sequence has no in-frame stop codon, then the to_stop 2052 argument has no effect: 2053 2054 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2055 >>> translate(coding_dna2) 2056 'VAIVMGR' 2057 >>> translate(coding_dna2, to_stop=True) 2058 'VAIVMGR' 2059 2060 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2061 or a stop codon. These are translated as "X". Any invalid codon 2062 (e.g. "TA?" or "T-A") will throw a TranslationError. 2063 2064 NOTE - Does NOT support gapped sequences. 2065 2066 It will however translate either DNA or RNA. 2067 """ 2068 if isinstance(sequence, Seq): 2069 return sequence.translate(table, stop_symbol, to_stop, cds) 2070 elif isinstance(sequence, MutableSeq): 2071 #Return a Seq object 2072 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2073 else: 2074 #Assume its a string, return a string 2075 try: 2076 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2077 except ValueError: 2078 codon_table = CodonTable.ambiguous_generic_by_name[table] 2079 except (AttributeError, TypeError): 2080 if isinstance(table, CodonTable.CodonTable): 2081 codon_table = table 2082 else: 2083 raise ValueError('Bad table argument') 2084 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2085 2086
2087 -def reverse_complement(sequence):
2088 """Returns the reverse complement sequence of a nucleotide string. 2089 2090 If given a string, returns a new string object. 2091 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2092 2093 Supports unambiguous and ambiguous nucleotide sequences. 2094 2095 e.g. 2096 2097 >>> reverse_complement("ACTG-NH") 2098 'DN-CAGT' 2099 """ 2100 if isinstance(sequence, Seq): 2101 #Return a Seq 2102 return sequence.reverse_complement() 2103 elif isinstance(sequence, MutableSeq): 2104 #Return a Seq 2105 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 2106 return sequence.toseq().reverse_complement() 2107 2108 #Assume its a string. 2109 #In order to avoid some code duplication, the old code would turn the string 2110 #into a Seq, use the reverse_complement method, and convert back to a string. 2111 #This worked, but is over five times slower on short sequences! 2112 if ('U' in sequence or 'u' in sequence) \ 2113 and ('T' in sequence or 't' in sequence): 2114 raise ValueError("Mixed RNA/DNA found") 2115 elif 'U' in sequence or 'u' in sequence: 2116 ttable = _rna_complement_table 2117 else: 2118 ttable = _dna_complement_table 2119 return sequence.translate(ttable)[::-1]
2120 2121
2122 -def _test():
2123 """Run the Bio.Seq module's doctests (PRIVATE).""" 2124 if sys.version_info[0:2] == (3, 1): 2125 print("Not running Bio.Seq doctest on Python 3.1") 2126 print("See http://bugs.python.org/issue7490") 2127 else: 2128 print("Running doctests...") 2129 import doctest 2130 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2131 print("Done")
2132 2133 if __name__ == "__main__": 2134 _test() 2135