Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  from __future__ import print_function 
  15   
  16  __docformat__ ="epytext en"  # Don't just use plain text in epydoc API pages! 
  17   
  18  import string  # for maketrans only 
  19  import array 
  20  import sys 
  21  import warnings 
  22   
  23  from Bio._py3k import range 
  24  from Bio._py3k import basestring 
  25   
  26  from Bio import Alphabet 
  27  from Bio.Alphabet import IUPAC 
  28  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  29  from Bio.Data import CodonTable 
  30   
  31   
32 -def _maketrans(complement_mapping):
33 """Makes a python string translation table (PRIVATE). 34 35 Arguments: 36 - complement_mapping - a dictionary such as ambiguous_dna_complement 37 and ambiguous_rna_complement from Data.IUPACData. 38 39 Returns a translation table (a string of length 256) for use with the 40 python string's translate method to use in a (reverse) complement. 41 42 Compatible with lower case and upper case sequences. 43 44 For internal use only. 45 """ 46 before = ''.join(complement_mapping.keys()) 47 after = ''.join(complement_mapping.values()) 48 before = before + before.lower() 49 after = after + after.lower() 50 if sys.version_info[0] == 3: 51 return str.maketrans(before, after) 52 else: 53 return string.maketrans(before, after)
54 55 _dna_complement_table = _maketrans(ambiguous_dna_complement) 56 _rna_complement_table = _maketrans(ambiguous_rna_complement) 57 58
59 -class Seq(object):
60 """A read-only sequence object (essentially a string with an alphabet). 61 62 Like normal python strings, our basic sequence object is immutable. 63 This prevents you from doing my_seq[5] = "A" for example, but does allow 64 Seq objects to be used as dictionary keys. 65 66 The Seq object provides a number of string like methods (such as count, 67 find, split and strip), which are alphabet aware where appropriate. 68 69 In addition to the string like sequence, the Seq object has an alphabet 70 property. This is an instance of an Alphabet class from Bio.Alphabet, 71 for example generic DNA, or IUPAC DNA. This describes the type of molecule 72 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 73 (letters). 74 75 The Seq object also provides some biological methods, such as complement, 76 reverse_complement, transcribe, back_transcribe and translate (which are 77 not applicable to sequences with a protein alphabet). 78 """
79 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
80 """Create a Seq object. 81 82 Arguments: 83 - seq - Sequence, required (string) 84 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 85 86 You will typically use Bio.SeqIO to read in sequences from files as 87 SeqRecord objects, whose sequence will be exposed as a Seq object via 88 the seq property. 89 90 However, will often want to create your own Seq objects directly: 91 92 >>> from Bio.Seq import Seq 93 >>> from Bio.Alphabet import IUPAC 94 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 95 ... IUPAC.protein) 96 >>> my_seq 97 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 98 >>> print(my_seq) 99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 100 >>> my_seq.alphabet 101 IUPACProtein() 102 103 """ 104 # Enforce string storage 105 if not isinstance(data, basestring): 106 raise TypeError("The sequence data given to a Seq object should " 107 "be a string (not another Seq object etc)") 108 self._data = data 109 self.alphabet = alphabet # Seq API requirement
110
111 - def __repr__(self):
112 """Returns a (truncated) representation of the sequence for debugging.""" 113 if len(self) > 60: 114 #Shows the last three letters as it is often useful to see if there 115 #is a stop codon at the end of a sequence. 116 #Note total length is 54+3+3=60 117 return "%s('%s...%s', %s)" % (self.__class__.__name__, 118 str(self)[:54], str(self)[-3:], 119 repr(self.alphabet)) 120 else: 121 return "%s(%s, %s)" % (self.__class__.__name__, 122 repr(self._data), 123 repr(self.alphabet))
124
125 - def __str__(self):
126 """Returns the full sequence as a python string, use str(my_seq). 127 128 Note that Biopython 1.44 and earlier would give a truncated 129 version of repr(my_seq) for str(my_seq). If you are writing code 130 which need to be backwards compatible with old Biopython, you 131 should continue to use my_seq.tostring() rather than str(my_seq). 132 """ 133 return self._data
134
135 - def __hash__(self):
136 """Hash for comparison. 137 138 See the __cmp__ documentation - we plan to change this! 139 """ 140 warnings.warn("In future comparing Seq objects will use string " 141 "comparison (not object comparison). Please use " 142 "hash(id(my_seq)) or my_dict[id(my_seq)] if you " 143 "want the current behaviour, or for string hashing " 144 "use hash(str(my_seq)) or my_dict[str(my_seq)] to " 145 "to make your code explicit and to avoid this " 146 "warning.", FutureWarning) 147 return id(self) # Currently use object identity for equality testing
148
149 - def __cmp__(self, other):
150 """Compare the sequence to another sequence or a string (README). 151 152 Historically comparing Seq objects has done Python object comparison. 153 After considerable discussion (keeping in mind constraints of the 154 Python language, hashes and dictionary support) a future release of 155 Biopython will change this to use simple string comparison. The plan is 156 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a 157 warning. 158 159 This version of Biopython still does Python object comparison, but with 160 a warning about this future change. During this transition period, 161 please just do explicit comparisons: 162 163 >>> seq1 = Seq("ACGT") 164 >>> seq2 = Seq("ACGT") 165 >>> id(seq1) == id(seq2) 166 False 167 >>> str(seq1) == str(seq2) 168 True 169 170 Note - This method indirectly supports ==, < , etc. 171 """ 172 if hasattr(other, "alphabet"): 173 #other should be a Seq or a MutableSeq 174 warnings.warn("In future comparing Seq objects will use string " 175 "comparison (not object comparison). Incompatible " 176 "alphabets will trigger a warning (not an exception). " 177 "In the interim please use id(seq1)==id(seq2) or " 178 "str(seq1)==str(seq2) to make your code explicit " 179 "and to avoid this warning.", FutureWarning) 180 return cmp(id(self), id(other))
181
182 - def __len__(self):
183 """Returns the length of the sequence, use len(my_seq).""" 184 return len(self._data) # Seq API requirement
185
186 - def __getitem__(self, index): # Seq API requirement
187 """Returns a subsequence of single letter, use my_seq[index].""" 188 #Note since Python 2.0, __getslice__ is deprecated 189 #and __getitem__ is used instead. 190 #See http://docs.python.org/ref/sequence-methods.html 191 if isinstance(index, int): 192 #Return a single letter as a string 193 return self._data[index] 194 else: 195 #Return the (sub)sequence as another Seq object 196 return Seq(self._data[index], self.alphabet)
197
198 - def __add__(self, other):
199 """Add another sequence or string to this sequence. 200 201 If adding a string to a Seq, the alphabet is preserved: 202 203 >>> from Bio.Seq import Seq 204 >>> from Bio.Alphabet import generic_protein 205 >>> Seq("MELKI", generic_protein) + "LV" 206 Seq('MELKILV', ProteinAlphabet()) 207 208 When adding two Seq (like) objects, the alphabets are important. 209 Consider this example: 210 211 >>> from Bio.Seq import Seq 212 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 213 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 214 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 215 >>> unamb_dna_seq 216 Seq('ACGT', IUPACUnambiguousDNA()) 217 >>> ambig_dna_seq 218 Seq('ACRGT', IUPACAmbiguousDNA()) 219 220 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 221 the more general ambiguous IUPAC DNA alphabet: 222 223 >>> unamb_dna_seq + ambig_dna_seq 224 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 225 226 However, if the default generic alphabet is included, the result is 227 a generic alphabet: 228 229 >>> Seq("") + ambig_dna_seq 230 Seq('ACRGT', Alphabet()) 231 232 You can't add RNA and DNA sequences: 233 234 >>> from Bio.Alphabet import generic_dna, generic_rna 235 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 236 Traceback (most recent call last): 237 ... 238 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 239 240 You can't add nucleotide and protein sequences: 241 242 >>> from Bio.Alphabet import generic_dna, generic_protein 243 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 244 Traceback (most recent call last): 245 ... 246 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 247 """ 248 if hasattr(other, "alphabet"): 249 #other should be a Seq or a MutableSeq 250 if not Alphabet._check_type_compatible([self.alphabet, 251 other.alphabet]): 252 raise TypeError("Incompatible alphabets %s and %s" 253 % (repr(self.alphabet), repr(other.alphabet))) 254 #They should be the same sequence type (or one of them is generic) 255 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 256 return self.__class__(str(self) + str(other), a) 257 elif isinstance(other, basestring): 258 #other is a plain string - use the current alphabet 259 return self.__class__(str(self) + other, self.alphabet) 260 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 261 if isinstance(other, SeqRecord): 262 #Get the SeqRecord's __radd__ to handle this 263 return NotImplemented 264 else: 265 raise TypeError
266
267 - def __radd__(self, other):
268 """Adding a sequence on the left. 269 270 If adding a string to a Seq, the alphabet is preserved: 271 272 >>> from Bio.Seq import Seq 273 >>> from Bio.Alphabet import generic_protein 274 >>> "LV" + Seq("MELKI", generic_protein) 275 Seq('LVMELKI', ProteinAlphabet()) 276 277 Adding two Seq (like) objects is handled via the __add__ method. 278 """ 279 if hasattr(other, "alphabet"): 280 #other should be a Seq or a MutableSeq 281 if not Alphabet._check_type_compatible([self.alphabet, 282 other.alphabet]): 283 raise TypeError("Incompatable alphabets %s and %s" 284 % (repr(self.alphabet), repr(other.alphabet))) 285 #They should be the same sequence type (or one of them is generic) 286 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 287 return self.__class__(str(other) + str(self), a) 288 elif isinstance(other, basestring): 289 #other is a plain string - use the current alphabet 290 return self.__class__(other + str(self), self.alphabet) 291 else: 292 raise TypeError
293
294 - def tostring(self): # Seq API requirement
295 """Returns the full sequence as a python string (semi-obsolete). 296 297 Although not formally deprecated, you are now encouraged to use 298 str(my_seq) instead of my_seq.tostring().""" 299 #TODO - Fix all places elsewhere in Biopython using this method, 300 #then start deprecation process? 301 #import warnings 302 #warnings.warn("This method is obsolete; please use str(my_seq) " 303 # "instead of my_seq.tostring().", 304 # PendingDeprecationWarning) 305 return str(self) 306
307 - def tomutable(self): # Needed? Or use a function?
308 """Returns the full sequence as a MutableSeq object. 309 310 >>> from Bio.Seq import Seq 311 >>> from Bio.Alphabet import IUPAC 312 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 313 ... IUPAC.protein) 314 >>> my_seq 315 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 316 >>> my_seq.tomutable() 317 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 318 319 Note that the alphabet is preserved. 320 """ 321 return MutableSeq(str(self), self.alphabet) 322
323 - def _get_seq_str_and_check_alphabet(self, other_sequence):
324 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 325 326 For a string argument, returns the string. 327 328 For a Seq or MutableSeq, it checks the alphabet is compatible 329 (raising an exception if it isn't), and then returns a string. 330 """ 331 try: 332 other_alpha = other_sequence.alphabet 333 except AttributeError: 334 #Assume other_sequence is a string 335 return other_sequence 336 337 #Other should be a Seq or a MutableSeq 338 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 339 raise TypeError("Incompatable alphabets %s and %s" 340 % (repr(self.alphabet), repr(other_alpha))) 341 #Return as a string 342 return str(other_sequence)
343
344 - def count(self, sub, start=0, end=sys.maxsize):
345 """Non-overlapping count method, like that of a python string. 346 347 This behaves like the python string method of the same name, 348 which does a non-overlapping count! 349 350 Returns an integer, the number of occurrences of substring 351 argument sub in the (sub)sequence given by [start:end]. 352 Optional arguments start and end are interpreted as in slice 353 notation. 354 355 Arguments: 356 - sub - a string or another Seq object to look for 357 - start - optional integer, slice start 358 - end - optional integer, slice end 359 360 e.g. 361 362 >>> from Bio.Seq import Seq 363 >>> my_seq = Seq("AAAATGA") 364 >>> print(my_seq.count("A")) 365 5 366 >>> print(my_seq.count("ATG")) 367 1 368 >>> print(my_seq.count(Seq("AT"))) 369 1 370 >>> print(my_seq.count("AT", 2, -1)) 371 1 372 373 HOWEVER, please note because python strings and Seq objects (and 374 MutableSeq objects) do a non-overlapping search, this may not give 375 the answer you expect: 376 377 >>> "AAAA".count("AA") 378 2 379 >>> print(Seq("AAAA").count("AA")) 380 2 381 382 A non-overlapping search would give the answer as three! 383 """ 384 #If it has one, check the alphabet: 385 sub_str = self._get_seq_str_and_check_alphabet(sub) 386 return str(self).count(sub_str, start, end)
387
388 - def __contains__(self, char):
389 """Implements the 'in' keyword, like a python string. 390 391 e.g. 392 393 >>> from Bio.Seq import Seq 394 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 395 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 396 >>> "AAA" in my_dna 397 True 398 >>> Seq("AAA") in my_dna 399 True 400 >>> Seq("AAA", generic_dna) in my_dna 401 True 402 403 Like other Seq methods, this will raise a type error if another Seq 404 (or Seq like) object with an incompatible alphabet is used: 405 406 >>> Seq("AAA", generic_rna) in my_dna 407 Traceback (most recent call last): 408 ... 409 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 410 >>> Seq("AAA", generic_protein) in my_dna 411 Traceback (most recent call last): 412 ... 413 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 414 """ 415 #If it has one, check the alphabet: 416 sub_str = self._get_seq_str_and_check_alphabet(char) 417 return sub_str in str(self)
418
419 - def find(self, sub, start=0, end=sys.maxsize):
420 """Find method, like that of a python string. 421 422 This behaves like the python string method of the same name. 423 424 Returns an integer, the index of the first occurrence of substring 425 argument sub in the (sub)sequence given by [start:end]. 426 427 Arguments: 428 - sub - a string or another Seq object to look for 429 - start - optional integer, slice start 430 - end - optional integer, slice end 431 432 Returns -1 if the subsequence is NOT found. 433 434 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 435 436 >>> from Bio.Seq import Seq 437 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 438 >>> my_rna.find("AUG") 439 3 440 """ 441 #If it has one, check the alphabet: 442 sub_str = self._get_seq_str_and_check_alphabet(sub) 443 return str(self).find(sub_str, start, end)
444
445 - def rfind(self, sub, start=0, end=sys.maxsize):
446 """Find from right method, like that of a python string. 447 448 This behaves like the python string method of the same name. 449 450 Returns an integer, the index of the last (right most) occurrence of 451 substring argument sub in the (sub)sequence given by [start:end]. 452 453 Arguments: 454 - sub - a string or another Seq object to look for 455 - start - optional integer, slice start 456 - end - optional integer, slice end 457 458 Returns -1 if the subsequence is NOT found. 459 460 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 461 462 >>> from Bio.Seq import Seq 463 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 464 >>> my_rna.rfind("AUG") 465 15 466 """ 467 #If it has one, check the alphabet: 468 sub_str = self._get_seq_str_and_check_alphabet(sub) 469 return str(self).rfind(sub_str, start, end)
470
471 - def startswith(self, prefix, start=0, end=sys.maxsize):
472 """Does the Seq start with the given prefix? Returns True/False. 473 474 This behaves like the python string method of the same name. 475 476 Return True if the sequence starts with the specified prefix 477 (a string or another Seq object), False otherwise. 478 With optional start, test sequence beginning at that position. 479 With optional end, stop comparing sequence at that position. 480 prefix can also be a tuple of strings to try. e.g. 481 482 >>> from Bio.Seq import Seq 483 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 484 >>> my_rna.startswith("GUC") 485 True 486 >>> my_rna.startswith("AUG") 487 False 488 >>> my_rna.startswith("AUG", 3) 489 True 490 >>> my_rna.startswith(("UCC", "UCA", "UCG"), 1) 491 True 492 """ 493 #If it has one, check the alphabet: 494 if isinstance(prefix, tuple): 495 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 496 for p in prefix) 497 return str(self).startswith(prefix_strs, start, end) 498 else: 499 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 500 return str(self).startswith(prefix_str, start, end)
501
502 - def endswith(self, suffix, start=0, end=sys.maxsize):
503 """Does the Seq end with the given suffix? Returns True/False. 504 505 This behaves like the python string method of the same name. 506 507 Return True if the sequence ends with the specified suffix 508 (a string or another Seq object), False otherwise. 509 With optional start, test sequence beginning at that position. 510 With optional end, stop comparing sequence at that position. 511 suffix can also be a tuple of strings to try. e.g. 512 513 >>> from Bio.Seq import Seq 514 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 515 >>> my_rna.endswith("UUG") 516 True 517 >>> my_rna.endswith("AUG") 518 False 519 >>> my_rna.endswith("AUG", 0, 18) 520 True 521 >>> my_rna.endswith(("UCC", "UCA", "UUG")) 522 True 523 """ 524 #If it has one, check the alphabet: 525 if isinstance(suffix, tuple): 526 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 527 for p in suffix) 528 return str(self).endswith(suffix_strs, start, end) 529 else: 530 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 531 return str(self).endswith(suffix_str, start, end)
532
533 - def split(self, sep=None, maxsplit=-1):
534 """Split method, like that of a python string. 535 536 This behaves like the python string method of the same name. 537 538 Return a list of the 'words' in the string (as Seq objects), 539 using sep as the delimiter string. If maxsplit is given, at 540 most maxsplit splits are done. If maxsplit is omitted, all 541 splits are made. 542 543 Following the python string method, sep will by default be any 544 white space (tabs, spaces, newlines) but this is unlikely to 545 apply to biological sequences. 546 547 e.g. 548 549 >>> from Bio.Seq import Seq 550 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 551 >>> my_aa = my_rna.translate() 552 >>> my_aa 553 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 554 >>> my_aa.split("*") 555 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 556 >>> my_aa.split("*", 1) 557 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 558 559 See also the rsplit method: 560 561 >>> my_aa.rsplit("*", 1) 562 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 563 """ 564 #If it has one, check the alphabet: 565 sep_str = self._get_seq_str_and_check_alphabet(sep) 566 #TODO - If the sep is the defined stop symbol, or gap char, 567 #should we adjust the alphabet? 568 return [Seq(part, self.alphabet) 569 for part in str(self).split(sep_str, maxsplit)]
570
571 - def rsplit(self, sep=None, maxsplit=-1):
572 """Right split method, like that of a python string. 573 574 This behaves like the python string method of the same name. 575 576 Return a list of the 'words' in the string (as Seq objects), 577 using sep as the delimiter string. If maxsplit is given, at 578 most maxsplit splits are done COUNTING FROM THE RIGHT. 579 If maxsplit is omitted, all splits are made. 580 581 Following the python string method, sep will by default be any 582 white space (tabs, spaces, newlines) but this is unlikely to 583 apply to biological sequences. 584 585 e.g. print(my_seq.rsplit("*",1)) 586 587 See also the split method. 588 """ 589 #If it has one, check the alphabet: 590 sep_str = self._get_seq_str_and_check_alphabet(sep) 591 return [Seq(part, self.alphabet) 592 for part in str(self).rsplit(sep_str, maxsplit)]
593
594 - def strip(self, chars=None):
595 """Returns a new Seq object with leading and trailing ends stripped. 596 597 This behaves like the python string method of the same name. 598 599 Optional argument chars defines which characters to remove. If 600 omitted or None (default) then as for the python string method, 601 this defaults to removing any white space. 602 603 e.g. print(my_seq.strip("-")) 604 605 See also the lstrip and rstrip methods. 606 """ 607 #If it has one, check the alphabet: 608 strip_str = self._get_seq_str_and_check_alphabet(chars) 609 return Seq(str(self).strip(strip_str), self.alphabet)
610
611 - def lstrip(self, chars=None):
612 """Returns a new Seq object with leading (left) end stripped. 613 614 This behaves like the python string method of the same name. 615 616 Optional argument chars defines which characters to remove. If 617 omitted or None (default) then as for the python string method, 618 this defaults to removing any white space. 619 620 e.g. print(my_seq.lstrip("-")) 621 622 See also the strip and rstrip methods. 623 """ 624 #If it has one, check the alphabet: 625 strip_str = self._get_seq_str_and_check_alphabet(chars) 626 return Seq(str(self).lstrip(strip_str), self.alphabet)
627
628 - def rstrip(self, chars=None):
629 """Returns a new Seq object with trailing (right) end stripped. 630 631 This behaves like the python string method of the same name. 632 633 Optional argument chars defines which characters to remove. If 634 omitted or None (default) then as for the python string method, 635 this defaults to removing any white space. 636 637 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 638 639 >>> from Bio.Alphabet import IUPAC 640 >>> from Bio.Seq import Seq 641 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 642 >>> my_seq 643 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 644 >>> my_seq.rstrip("A") 645 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 646 647 See also the strip and lstrip methods. 648 """ 649 #If it has one, check the alphabet: 650 strip_str = self._get_seq_str_and_check_alphabet(chars) 651 return Seq(str(self).rstrip(strip_str), self.alphabet)
652
653 - def upper(self):
654 """Returns an upper case copy of the sequence. 655 656 >>> from Bio.Alphabet import HasStopCodon, generic_protein 657 >>> from Bio.Seq import Seq 658 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 659 >>> my_seq 660 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 661 >>> my_seq.lower() 662 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 663 >>> my_seq.upper() 664 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 665 666 This will adjust the alphabet if required. See also the lower method. 667 """ 668 return Seq(str(self).upper(), self.alphabet._upper())
669
670 - def lower(self):
671 """Returns a lower case copy of the sequence. 672 673 This will adjust the alphabet if required. Note that the IUPAC alphabets 674 are upper case only, and thus a generic alphabet must be substituted. 675 676 >>> from Bio.Alphabet import Gapped, generic_dna 677 >>> from Bio.Alphabet import IUPAC 678 >>> from Bio.Seq import Seq 679 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 680 >>> my_seq 681 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 682 >>> my_seq.lower() 683 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 684 685 See also the upper method. 686 """ 687 return Seq(str(self).lower(), self.alphabet._lower())
688
689 - def complement(self):
690 """Returns the complement sequence. New Seq object. 691 692 >>> from Bio.Seq import Seq 693 >>> from Bio.Alphabet import IUPAC 694 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 695 >>> my_dna 696 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 697 >>> my_dna.complement() 698 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 699 700 You can of course used mixed case sequences, 701 702 >>> from Bio.Seq import Seq 703 >>> from Bio.Alphabet import generic_dna 704 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 705 >>> my_dna 706 Seq('CCCCCgatA-GD', DNAAlphabet()) 707 >>> my_dna.complement() 708 Seq('GGGGGctaT-CH', DNAAlphabet()) 709 710 Note in the above example, ambiguous character D denotes 711 G, A or T so its complement is H (for C, T or A). 712 713 Trying to complement a protein sequence raises an exception. 714 715 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 716 >>> my_protein.complement() 717 Traceback (most recent call last): 718 ... 719 ValueError: Proteins do not have complements! 720 """ 721 base = Alphabet._get_base_alphabet(self.alphabet) 722 if isinstance(base, Alphabet.ProteinAlphabet): 723 raise ValueError("Proteins do not have complements!") 724 if isinstance(base, Alphabet.DNAAlphabet): 725 ttable = _dna_complement_table 726 elif isinstance(base, Alphabet.RNAAlphabet): 727 ttable = _rna_complement_table 728 elif ('U' in self._data or 'u' in self._data) \ 729 and ('T' in self._data or 't' in self._data): 730 #TODO - Handle this cleanly? 731 raise ValueError("Mixed RNA/DNA found") 732 elif 'U' in self._data or 'u' in self._data: 733 ttable = _rna_complement_table 734 else: 735 ttable = _dna_complement_table 736 #Much faster on really long sequences than the previous loop based one. 737 #thx to Michael Palmer, University of Waterloo 738 return Seq(str(self).translate(ttable), self.alphabet)
739
740 - def reverse_complement(self):
741 """Returns the reverse complement sequence. New Seq object. 742 743 >>> from Bio.Seq import Seq 744 >>> from Bio.Alphabet import IUPAC 745 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 746 >>> my_dna 747 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 748 >>> my_dna.reverse_complement() 749 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 750 751 Note in the above example, since R = G or A, its complement 752 is Y (which denotes C or T). 753 754 You can of course used mixed case sequences, 755 756 >>> from Bio.Seq import Seq 757 >>> from Bio.Alphabet import generic_dna 758 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 759 >>> my_dna 760 Seq('CCCCCgatA-G', DNAAlphabet()) 761 >>> my_dna.reverse_complement() 762 Seq('C-TatcGGGGG', DNAAlphabet()) 763 764 Trying to complement a protein sequence raises an exception: 765 766 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 767 >>> my_protein.reverse_complement() 768 Traceback (most recent call last): 769 ... 770 ValueError: Proteins do not have complements! 771 """ 772 #Use -1 stride/step to reverse the complement 773 return self.complement()[::-1]
774
775 - def transcribe(self):
776 """Returns the RNA sequence from a DNA sequence. New Seq object. 777 778 >>> from Bio.Seq import Seq 779 >>> from Bio.Alphabet import IUPAC 780 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 781 ... IUPAC.unambiguous_dna) 782 >>> coding_dna 783 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 784 >>> coding_dna.transcribe() 785 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 786 787 Trying to transcribe a protein or RNA sequence raises an exception: 788 789 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 790 >>> my_protein.transcribe() 791 Traceback (most recent call last): 792 ... 793 ValueError: Proteins cannot be transcribed! 794 """ 795 base = Alphabet._get_base_alphabet(self.alphabet) 796 if isinstance(base, Alphabet.ProteinAlphabet): 797 raise ValueError("Proteins cannot be transcribed!") 798 if isinstance(base, Alphabet.RNAAlphabet): 799 raise ValueError("RNA cannot be transcribed!") 800 801 if self.alphabet==IUPAC.unambiguous_dna: 802 alphabet = IUPAC.unambiguous_rna 803 elif self.alphabet==IUPAC.ambiguous_dna: 804 alphabet = IUPAC.ambiguous_rna 805 else: 806 alphabet = Alphabet.generic_rna 807 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
808
809 - def back_transcribe(self):
810 """Returns the DNA sequence from an RNA sequence. New Seq object. 811 812 >>> from Bio.Seq import Seq 813 >>> from Bio.Alphabet import IUPAC 814 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 815 ... IUPAC.unambiguous_rna) 816 >>> messenger_rna 817 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 818 >>> messenger_rna.back_transcribe() 819 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 820 821 Trying to back-transcribe a protein or DNA sequence raises an 822 exception: 823 824 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 825 >>> my_protein.back_transcribe() 826 Traceback (most recent call last): 827 ... 828 ValueError: Proteins cannot be back transcribed! 829 """ 830 base = Alphabet._get_base_alphabet(self.alphabet) 831 if isinstance(base, Alphabet.ProteinAlphabet): 832 raise ValueError("Proteins cannot be back transcribed!") 833 if isinstance(base, Alphabet.DNAAlphabet): 834 raise ValueError("DNA cannot be back transcribed!") 835 836 if self.alphabet==IUPAC.unambiguous_rna: 837 alphabet = IUPAC.unambiguous_dna 838 elif self.alphabet==IUPAC.ambiguous_rna: 839 alphabet = IUPAC.ambiguous_dna 840 else: 841 alphabet = Alphabet.generic_dna 842 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
843
844 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 845 cds=False):
846 """Turns a nucleotide sequence into a protein sequence. New Seq object. 847 848 This method will translate DNA or RNA sequences, and those with a 849 nucleotide or generic alphabet. Trying to translate a protein 850 sequence raises an exception. 851 852 Arguments: 853 - table - Which codon table to use? This can be either a name 854 (string), an NCBI identifier (integer), or a CodonTable 855 object (useful for non-standard genetic codes). This 856 defaults to the "Standard" table. 857 - stop_symbol - Single character string, what to use for terminators. 858 This defaults to the asterisk, "*". 859 - to_stop - Boolean, defaults to False meaning do a full translation 860 continuing on past any stop codons (translated as the 861 specified stop_symbol). If True, translation is 862 terminated at the first in frame stop codon (and the 863 stop_symbol is not appended to the returned protein 864 sequence). 865 - cds - Boolean, indicates this is a complete CDS. If True, 866 this checks the sequence starts with a valid alternative start 867 codon (which will be translated as methionine, M), that the 868 sequence length is a multiple of three, and that there is a 869 single in frame stop codon at the end (this will be excluded 870 from the protein sequence, regardless of the to_stop option). 871 If these tests fail, an exception is raised. 872 873 e.g. Using the standard table: 874 875 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 876 >>> coding_dna.translate() 877 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 878 >>> coding_dna.translate(stop_symbol="@") 879 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 880 >>> coding_dna.translate(to_stop=True) 881 Seq('VAIVMGR', ExtendedIUPACProtein()) 882 883 Now using NCBI table 2, where TGA is not a stop codon: 884 885 >>> coding_dna.translate(table=2) 886 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 887 >>> coding_dna.translate(table=2, to_stop=True) 888 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 889 890 In fact, GTG is an alternative start codon under NCBI table 2, meaning 891 this sequence could be a complete CDS: 892 893 >>> coding_dna.translate(table=2, cds=True) 894 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 895 896 It isn't a valid CDS under NCBI table 1, due to both the start codon and 897 also the in frame stop codons: 898 899 >>> coding_dna.translate(table=1, cds=True) 900 Traceback (most recent call last): 901 ... 902 TranslationError: First codon 'GTG' is not a start codon 903 904 If the sequence has no in-frame stop codon, then the to_stop argument 905 has no effect: 906 907 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 908 >>> coding_dna2.translate() 909 Seq('LAIVMGR', ExtendedIUPACProtein()) 910 >>> coding_dna2.translate(to_stop=True) 911 Seq('LAIVMGR', ExtendedIUPACProtein()) 912 913 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 914 or a stop codon. These are translated as "X". Any invalid codon 915 (e.g. "TA?" or "T-A") will throw a TranslationError. 916 917 NOTE - Does NOT support gapped sequences. 918 919 NOTE - This does NOT behave like the python string's translate 920 method. For that use str(my_seq).translate(...) instead. 921 """ 922 if isinstance(table, str) and len(table)==256: 923 raise ValueError("The Seq object translate method DOES NOT take " 924 + "a 256 character string mapping table like " 925 + "the python string object's translate method. " 926 + "Use str(my_seq).translate(...) instead.") 927 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 928 Alphabet.ProteinAlphabet): 929 raise ValueError("Proteins cannot be translated!") 930 try: 931 table_id = int(table) 932 except ValueError: 933 #Assume its a table name 934 if self.alphabet==IUPAC.unambiguous_dna: 935 #Will use standard IUPAC protein alphabet, no need for X 936 codon_table = CodonTable.unambiguous_dna_by_name[table] 937 elif self.alphabet==IUPAC.unambiguous_rna: 938 #Will use standard IUPAC protein alphabet, no need for X 939 codon_table = CodonTable.unambiguous_rna_by_name[table] 940 else: 941 #This will use the extended IUPAC protein alphabet with X etc. 942 #The same table can be used for RNA or DNA (we use this for 943 #translating strings). 944 codon_table = CodonTable.ambiguous_generic_by_name[table] 945 except (AttributeError, TypeError): 946 #Assume its a CodonTable object 947 if isinstance(table, CodonTable.CodonTable): 948 codon_table = table 949 else: 950 raise ValueError('Bad table argument') 951 else: 952 #Assume its a table ID 953 if self.alphabet==IUPAC.unambiguous_dna: 954 #Will use standard IUPAC protein alphabet, no need for X 955 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 956 elif self.alphabet==IUPAC.unambiguous_rna: 957 #Will use standard IUPAC protein alphabet, no need for X 958 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 959 else: 960 #This will use the extended IUPAC protein alphabet with X etc. 961 #The same table can be used for RNA or DNA (we use this for 962 #translating strings). 963 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 964 protein = _translate_str(str(self), codon_table, 965 stop_symbol, to_stop, cds) 966 if stop_symbol in protein: 967 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 968 stop_symbol = stop_symbol) 969 else: 970 alphabet = codon_table.protein_alphabet 971 return Seq(protein, alphabet)
972
973 - def ungap(self, gap=None):
974 """Return a copy of the sequence without the gap character(s). 975 976 The gap character can be specified in two ways - either as an explicit 977 argument, or via the sequence's alphabet. For example: 978 979 >>> from Bio.Seq import Seq 980 >>> from Bio.Alphabet import generic_dna 981 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 982 >>> my_dna 983 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 984 >>> my_dna.ungap("-") 985 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 986 987 If the gap character is not given as an argument, it will be taken from 988 the sequence's alphabet (if defined). Notice that the returned sequence's 989 alphabet is adjusted since it no longer requires a gapped alphabet: 990 991 >>> from Bio.Seq import Seq 992 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 993 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 994 >>> my_pro 995 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 996 >>> my_pro.ungap() 997 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 998 999 Or, with a simpler gapped DNA example: 1000 1001 >>> from Bio.Seq import Seq 1002 >>> from Bio.Alphabet import IUPAC, Gapped 1003 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 1004 >>> my_seq 1005 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1006 >>> my_seq.ungap() 1007 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1008 1009 As long as it is consistent with the alphabet, although it is redundant, 1010 you can still supply the gap character as an argument to this method: 1011 1012 >>> my_seq 1013 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1014 >>> my_seq.ungap("=") 1015 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1016 1017 However, if the gap character given as the argument disagrees with that 1018 declared in the alphabet, an exception is raised: 1019 1020 >>> my_seq 1021 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1022 >>> my_seq.ungap("-") 1023 Traceback (most recent call last): 1024 ... 1025 ValueError: Gap '-' does not match '=' from alphabet 1026 1027 Finally, if a gap character is not supplied, and the alphabet does not 1028 define one, an exception is raised: 1029 1030 >>> from Bio.Seq import Seq 1031 >>> from Bio.Alphabet import generic_dna 1032 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1033 >>> my_dna 1034 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1035 >>> my_dna.ungap() 1036 Traceback (most recent call last): 1037 ... 1038 ValueError: Gap character not given and not defined in alphabet 1039 1040 """ 1041 if hasattr(self.alphabet, "gap_char"): 1042 if not gap: 1043 gap = self.alphabet.gap_char 1044 elif gap != self.alphabet.gap_char: 1045 raise ValueError("Gap %s does not match %s from alphabet" 1046 % (repr(gap), repr(self.alphabet.gap_char))) 1047 alpha = Alphabet._ungap(self.alphabet) 1048 elif not gap: 1049 raise ValueError("Gap character not given and not defined in alphabet") 1050 else: 1051 alpha = self.alphabet # modify! 1052 if len(gap)!=1 or not isinstance(gap, str): 1053 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1054 return Seq(str(self).replace(gap, ""), alpha)
1055 1056
1057 -class UnknownSeq(Seq):
1058 """A read-only sequence object of known length but unknown contents. 1059 1060 If you have an unknown sequence, you can represent this with a normal 1061 Seq object, for example: 1062 1063 >>> my_seq = Seq("N"*5) 1064 >>> my_seq 1065 Seq('NNNNN', Alphabet()) 1066 >>> len(my_seq) 1067 5 1068 >>> print(my_seq) 1069 NNNNN 1070 1071 However, this is rather wasteful of memory (especially for large 1072 sequences), which is where this class is most usefull: 1073 1074 >>> unk_five = UnknownSeq(5) 1075 >>> unk_five 1076 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1077 >>> len(unk_five) 1078 5 1079 >>> print(unk_five) 1080 ????? 1081 1082 You can add unknown sequence together, provided their alphabets and 1083 characters are compatible, and get another memory saving UnknownSeq: 1084 1085 >>> unk_four = UnknownSeq(4) 1086 >>> unk_four 1087 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1088 >>> unk_four + unk_five 1089 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1090 1091 If the alphabet or characters don't match up, the addition gives an 1092 ordinary Seq object: 1093 1094 >>> unk_nnnn = UnknownSeq(4, character = "N") 1095 >>> unk_nnnn 1096 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1097 >>> unk_nnnn + unk_four 1098 Seq('NNNN????', Alphabet()) 1099 1100 Combining with a real Seq gives a new Seq object: 1101 1102 >>> known_seq = Seq("ACGT") 1103 >>> unk_four + known_seq 1104 Seq('????ACGT', Alphabet()) 1105 >>> known_seq + unk_four 1106 Seq('ACGT????', Alphabet()) 1107 """
1108 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1109 """Create a new UnknownSeq object. 1110 1111 If character is omitted, it is determined from the alphabet, "N" for 1112 nucleotides, "X" for proteins, and "?" otherwise. 1113 """ 1114 self._length = int(length) 1115 if self._length < 0: 1116 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1117 raise ValueError("Length must not be negative.") 1118 self.alphabet = alphabet 1119 if character: 1120 if len(character) != 1: 1121 raise ValueError("character argument should be a single letter string.") 1122 self._character = character 1123 else: 1124 base = Alphabet._get_base_alphabet(alphabet) 1125 #TODO? Check the case of the letters in the alphabet? 1126 #We may have to use "n" instead of "N" etc. 1127 if isinstance(base, Alphabet.NucleotideAlphabet): 1128 self._character = "N" 1129 elif isinstance(base, Alphabet.ProteinAlphabet): 1130 self._character = "X" 1131 else: 1132 self._character = "?"
1133
1134 - def __len__(self):
1135 """Returns the stated length of the unknown sequence.""" 1136 return self._length
1137
1138 - def __str__(self):
1139 """Returns the unknown sequence as full string of the given length.""" 1140 return self._character * self._length
1141
1142 - def __repr__(self):
1143 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1144 % (self._length, repr(self.alphabet), repr(self._character))
1145
1146 - def __add__(self, other):
1147 """Add another sequence or string to this sequence. 1148 1149 Adding two UnknownSeq objects returns another UnknownSeq object 1150 provided the character is the same and the alphabets are compatible. 1151 1152 >>> from Bio.Seq import UnknownSeq 1153 >>> from Bio.Alphabet import generic_protein 1154 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1155 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1156 1157 If the characters differ, an UnknownSeq object cannot be used, so a 1158 Seq object is returned: 1159 1160 >>> from Bio.Seq import UnknownSeq 1161 >>> from Bio.Alphabet import generic_protein 1162 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1163 ... character="x") 1164 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1165 1166 If adding a string to an UnknownSeq, a new Seq is returned with the 1167 same alphabet: 1168 1169 >>> from Bio.Seq import UnknownSeq 1170 >>> from Bio.Alphabet import generic_protein 1171 >>> UnknownSeq(5, generic_protein) + "LV" 1172 Seq('XXXXXLV', ProteinAlphabet()) 1173 """ 1174 if isinstance(other, UnknownSeq) \ 1175 and other._character == self._character: 1176 #TODO - Check the alphabets match 1177 return UnknownSeq(len(self)+len(other), 1178 self.alphabet, self._character) 1179 #Offload to the base class... 1180 return Seq(str(self), self.alphabet) + other
1181
1182 - def __radd__(self, other):
1183 #If other is an UnknownSeq, then __add__ would be called. 1184 #Offload to the base class... 1185 return other + Seq(str(self), self.alphabet)
1186
1187 - def __getitem__(self, index):
1188 """Get a subsequence from the UnknownSeq object. 1189 1190 >>> unk = UnknownSeq(8, character="N") 1191 >>> print(unk[:]) 1192 NNNNNNNN 1193 >>> print(unk[5:3]) 1194 <BLANKLINE> 1195 >>> print(unk[1:-1]) 1196 NNNNNN 1197 >>> print(unk[1:-1:2]) 1198 NNN 1199 """ 1200 if isinstance(index, int): 1201 #TODO - Check the bounds without wasting memory 1202 return str(self)[index] 1203 old_length = self._length 1204 step = index.step 1205 if step is None or step == 1: 1206 #This calculates the length you'd get from ("N"*old_length)[index] 1207 start = index.start 1208 end = index.stop 1209 if start is None: 1210 start = 0 1211 elif start < 0: 1212 start = max(0, old_length + start) 1213 elif start > old_length: 1214 start = old_length 1215 if end is None: 1216 end = old_length 1217 elif end < 0: 1218 end = max(0, old_length + end) 1219 elif end > old_length: 1220 end = old_length 1221 new_length = max(0, end-start) 1222 elif step == 0: 1223 raise ValueError("slice step cannot be zero") 1224 else: 1225 #TODO - handle step efficiently 1226 new_length = len(("X"*old_length)[index]) 1227 #assert new_length == len(("X"*old_length)[index]), \ 1228 # (index, start, end, step, old_length, 1229 # new_length, len(("X"*old_length)[index])) 1230 return UnknownSeq(new_length, self.alphabet, self._character)
1231
1232 - def count(self, sub, start=0, end=sys.maxsize):
1233 """Non-overlapping count method, like that of a python string. 1234 1235 This behaves like the python string (and Seq object) method of the 1236 same name, which does a non-overlapping count! 1237 1238 Returns an integer, the number of occurrences of substring 1239 argument sub in the (sub)sequence given by [start:end]. 1240 Optional arguments start and end are interpreted as in slice 1241 notation. 1242 1243 Arguments: 1244 - sub - a string or another Seq object to look for 1245 - start - optional integer, slice start 1246 - end - optional integer, slice end 1247 1248 >>> "NNNN".count("N") 1249 4 1250 >>> Seq("NNNN").count("N") 1251 4 1252 >>> UnknownSeq(4, character="N").count("N") 1253 4 1254 >>> UnknownSeq(4, character="N").count("A") 1255 0 1256 >>> UnknownSeq(4, character="N").count("AA") 1257 0 1258 1259 HOWEVER, please note because that python strings and Seq objects (and 1260 MutableSeq objects) do a non-overlapping search, this may not give 1261 the answer you expect: 1262 1263 >>> UnknownSeq(4, character="N").count("NN") 1264 2 1265 >>> UnknownSeq(4, character="N").count("NNN") 1266 1 1267 """ 1268 sub_str = self._get_seq_str_and_check_alphabet(sub) 1269 if len(sub_str) == 1: 1270 if str(sub_str) == self._character: 1271 if start==0 and end >= self._length: 1272 return self._length 1273 else: 1274 #This could be done more cleverly... 1275 return str(self).count(sub_str, start, end) 1276 else: 1277 return 0 1278 else: 1279 if set(sub_str) == set(self._character): 1280 if start==0 and end >= self._length: 1281 return self._length // len(sub_str) 1282 else: 1283 #This could be done more cleverly... 1284 return str(self).count(sub_str, start, end) 1285 else: 1286 return 0
1287
1288 - def complement(self):
1289 """The complement of an unknown nucleotide equals itself. 1290 1291 >>> my_nuc = UnknownSeq(8) 1292 >>> my_nuc 1293 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1294 >>> print(my_nuc) 1295 ???????? 1296 >>> my_nuc.complement() 1297 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1298 >>> print(my_nuc.complement()) 1299 ???????? 1300 """ 1301 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1302 Alphabet.ProteinAlphabet): 1303 raise ValueError("Proteins do not have complements!") 1304 return self
1305
1306 - def reverse_complement(self):
1307 """The reverse complement of an unknown nucleotide equals itself. 1308 1309 >>> my_nuc = UnknownSeq(10) 1310 >>> my_nuc 1311 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1312 >>> print(my_nuc) 1313 ?????????? 1314 >>> my_nuc.reverse_complement() 1315 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1316 >>> print(my_nuc.reverse_complement()) 1317 ?????????? 1318 """ 1319 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1320 Alphabet.ProteinAlphabet): 1321 raise ValueError("Proteins do not have complements!") 1322 return self
1323
1324 - def transcribe(self):
1325 """Returns unknown RNA sequence from an unknown DNA sequence. 1326 1327 >>> my_dna = UnknownSeq(10, character="N") 1328 >>> my_dna 1329 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1330 >>> print(my_dna) 1331 NNNNNNNNNN 1332 >>> my_rna = my_dna.transcribe() 1333 >>> my_rna 1334 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1335 >>> print(my_rna) 1336 NNNNNNNNNN 1337 """ 1338 #Offload the alphabet stuff 1339 s = Seq(self._character, self.alphabet).transcribe() 1340 return UnknownSeq(self._length, s.alphabet, self._character)
1341
1342 - def back_transcribe(self):
1343 """Returns unknown DNA sequence from an unknown RNA sequence. 1344 1345 >>> my_rna = UnknownSeq(20, character="N") 1346 >>> my_rna 1347 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1348 >>> print(my_rna) 1349 NNNNNNNNNNNNNNNNNNNN 1350 >>> my_dna = my_rna.back_transcribe() 1351 >>> my_dna 1352 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1353 >>> print(my_dna) 1354 NNNNNNNNNNNNNNNNNNNN 1355 """ 1356 #Offload the alphabet stuff 1357 s = Seq(self._character, self.alphabet).back_transcribe() 1358 return UnknownSeq(self._length, s.alphabet, self._character)
1359
1360 - def upper(self):
1361 """Returns an upper case copy of the sequence. 1362 1363 >>> from Bio.Alphabet import generic_dna 1364 >>> from Bio.Seq import UnknownSeq 1365 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1366 >>> my_seq 1367 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1368 >>> print(my_seq) 1369 nnnnnnnnnnnnnnnnnnnn 1370 >>> my_seq.upper() 1371 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1372 >>> print(my_seq.upper()) 1373 NNNNNNNNNNNNNNNNNNNN 1374 1375 This will adjust the alphabet if required. See also the lower method. 1376 """ 1377 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1378
1379 - def lower(self):
1380 """Returns a lower case copy of the sequence. 1381 1382 This will adjust the alphabet if required: 1383 1384 >>> from Bio.Alphabet import IUPAC 1385 >>> from Bio.Seq import UnknownSeq 1386 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1387 >>> my_seq 1388 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1389 >>> print(my_seq) 1390 XXXXXXXXXXXXXXXXXXXX 1391 >>> my_seq.lower() 1392 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1393 >>> print(my_seq.lower()) 1394 xxxxxxxxxxxxxxxxxxxx 1395 1396 See also the upper method. 1397 """ 1398 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1399
1400 - def translate(self, **kwargs):
1401 """Translate an unknown nucleotide sequence into an unknown protein. 1402 1403 e.g. 1404 1405 >>> my_seq = UnknownSeq(9, character="N") 1406 >>> print(my_seq) 1407 NNNNNNNNN 1408 >>> my_protein = my_seq.translate() 1409 >>> my_protein 1410 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1411 >>> print(my_protein) 1412 XXX 1413 1414 In comparison, using a normal Seq object: 1415 1416 >>> my_seq = Seq("NNNNNNNNN") 1417 >>> print(my_seq) 1418 NNNNNNNNN 1419 >>> my_protein = my_seq.translate() 1420 >>> my_protein 1421 Seq('XXX', ExtendedIUPACProtein()) 1422 >>> print(my_protein) 1423 XXX 1424 1425 """ 1426 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1427 Alphabet.ProteinAlphabet): 1428 raise ValueError("Proteins cannot be translated!") 1429 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1430
1431 - def ungap(self, gap=None):
1432 """Return a copy of the sequence without the gap character(s). 1433 1434 The gap character can be specified in two ways - either as an explicit 1435 argument, or via the sequence's alphabet. For example: 1436 1437 >>> from Bio.Seq import UnknownSeq 1438 >>> from Bio.Alphabet import Gapped, generic_dna 1439 >>> my_dna = UnknownSeq(20, Gapped(generic_dna, "-")) 1440 >>> my_dna 1441 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1442 >>> my_dna.ungap() 1443 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1444 >>> my_dna.ungap("-") 1445 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1446 1447 If the UnknownSeq is using the gap character, then an empty Seq is 1448 returned: 1449 1450 >>> my_gap = UnknownSeq(20, Gapped(generic_dna, "-"), character="-") 1451 >>> my_gap 1452 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1453 >>> my_gap.ungap() 1454 Seq('', DNAAlphabet()) 1455 >>> my_gap.ungap("-") 1456 Seq('', DNAAlphabet()) 1457 1458 Notice that the returned sequence's alphabet is adjusted to remove any 1459 explicit gap character declaration. 1460 """ 1461 #Offload the alphabet stuff 1462 s = Seq(self._character, self.alphabet).ungap() 1463 if s: 1464 return UnknownSeq(self._length, s.alphabet, self._character) 1465 else: 1466 return Seq("", s.alphabet)
1467 1468
1469 -class MutableSeq(object):
1470 """An editable sequence object (with an alphabet). 1471 1472 Unlike normal python strings and our basic sequence object (the Seq class) 1473 which are immuatable, the MutableSeq lets you edit the sequence in place. 1474 However, this means you cannot use a MutableSeq object as a dictionary key. 1475 1476 >>> from Bio.Seq import MutableSeq 1477 >>> from Bio.Alphabet import generic_dna 1478 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1479 >>> my_seq 1480 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1481 >>> my_seq[5] 1482 'T' 1483 >>> my_seq[5] = "A" 1484 >>> my_seq 1485 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1486 >>> my_seq[5] 1487 'A' 1488 >>> my_seq[5:8] = "NNN" 1489 >>> my_seq 1490 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1491 >>> len(my_seq) 1492 11 1493 1494 Note that the MutableSeq object does not support as many string-like 1495 or biological methods as the Seq object. 1496 """
1497 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1498 if sys.version_info[0] == 3: 1499 self.array_indicator = "u" 1500 else: 1501 self.array_indicator = "c" 1502 if isinstance(data, str): # TODO - What about unicode? 1503 self.data = array.array(self.array_indicator, data) 1504 else: 1505 self.data = data # assumes the input is an array 1506 self.alphabet = alphabet
1507
1508 - def __repr__(self):
1509 """Returns a (truncated) representation of the sequence for debugging.""" 1510 if len(self) > 60: 1511 #Shows the last three letters as it is often useful to see if there 1512 #is a stop codon at the end of a sequence. 1513 #Note total length is 54+3+3=60 1514 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1515 str(self[:54]), str(self[-3:]), 1516 repr(self.alphabet)) 1517 else: 1518 return "%s('%s', %s)" % (self.__class__.__name__, 1519 str(self), 1520 repr(self.alphabet))
1521
1522 - def __str__(self):
1523 """Returns the full sequence as a python string. 1524 1525 Note that Biopython 1.44 and earlier would give a truncated 1526 version of repr(my_seq) for str(my_seq). If you are writing code 1527 which needs to be backwards compatible with old Biopython, you 1528 should continue to use my_seq.tostring() rather than str(my_seq). 1529 """ 1530 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1531 return "".join(self.data)
1532
1533 - def __cmp__(self, other):
1534 """Compare the sequence to another sequence or a string (README). 1535 1536 Currently if compared to another sequence the alphabets must be 1537 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1538 an exception. Otherwise only the sequence itself is compared, not the 1539 precise alphabet. 1540 1541 A future release of Biopython will change this (and the Seq object etc) 1542 to use simple string comparison. The plan is that comparing sequences 1543 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1544 but not an exception. 1545 1546 During this transition period, please just do explicit comparisons: 1547 1548 >>> seq1 = MutableSeq("ACGT") 1549 >>> seq2 = MutableSeq("ACGT") 1550 >>> id(seq1) == id(seq2) 1551 False 1552 >>> str(seq1) == str(seq2) 1553 True 1554 1555 This method indirectly supports ==, < , etc. 1556 """ 1557 if hasattr(other, "alphabet"): 1558 #other should be a Seq or a MutableSeq 1559 import warnings 1560 warnings.warn("In future comparing incompatible alphabets will " 1561 "only trigger a warning (not an exception). In " 1562 "the interim please use id(seq1)==id(seq2) or " 1563 "str(seq1)==str(seq2) to make your code explicit " 1564 "and to avoid this warning.", FutureWarning) 1565 if not Alphabet._check_type_compatible([self.alphabet, 1566 other.alphabet]): 1567 raise TypeError("Incompatable alphabets %s and %s" 1568 % (repr(self.alphabet), repr(other.alphabet))) 1569 #They should be the same sequence type (or one of them is generic) 1570 if isinstance(other, MutableSeq): 1571 #See test_GAQueens.py for an historic usage of a non-string 1572 #alphabet! Comparing the arrays supports this. 1573 return cmp(self.data, other.data) 1574 else: 1575 return cmp(str(self), str(other)) 1576 elif isinstance(other, basestring): 1577 return cmp(str(self), other) 1578 else: 1579 raise TypeError
1580
1581 - def __len__(self):
1582 return len(self.data)
1583
1584 - def __getitem__(self, index):
1585 #Note since Python 2.0, __getslice__ is deprecated 1586 #and __getitem__ is used instead. 1587 #See http://docs.python.org/ref/sequence-methods.html 1588 if isinstance(index, int): 1589 #Return a single letter as a string 1590 return self.data[index] 1591 else: 1592 #Return the (sub)sequence as another Seq object 1593 return MutableSeq(self.data[index], self.alphabet)
1594
1595 - def __setitem__(self, index, value):
1596 #Note since Python 2.0, __setslice__ is deprecated 1597 #and __setitem__ is used instead. 1598 #See http://docs.python.org/ref/sequence-methods.html 1599 if isinstance(index, int): 1600 #Replacing a single letter with a new string 1601 self.data[index] = value 1602 else: 1603 #Replacing a sub-sequence 1604 if isinstance(value, MutableSeq): 1605 self.data[index] = value.data 1606 elif isinstance(value, type(self.data)): 1607 self.data[index] = value 1608 else: 1609 self.data[index] = array.array(self.array_indicator, 1610 str(value))
1611
1612 - def __delitem__(self, index):
1613 #Note since Python 2.0, __delslice__ is deprecated 1614 #and __delitem__ is used instead. 1615 #See http://docs.python.org/ref/sequence-methods.html 1616 1617 #Could be deleting a single letter, or a slice 1618 del self.data[index]
1619
1620 - def __add__(self, other):
1621 """Add another sequence or string to this sequence. 1622 1623 Returns a new MutableSeq object.""" 1624 if hasattr(other, "alphabet"): 1625 #other should be a Seq or a MutableSeq 1626 if not Alphabet._check_type_compatible([self.alphabet, 1627 other.alphabet]): 1628 raise TypeError("Incompatable alphabets %s and %s" 1629 % (repr(self.alphabet), repr(other.alphabet))) 1630 #They should be the same sequence type (or one of them is generic) 1631 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1632 if isinstance(other, MutableSeq): 1633 #See test_GAQueens.py for an historic usage of a non-string 1634 #alphabet! Adding the arrays should support this. 1635 return self.__class__(self.data + other.data, a) 1636 else: 1637 return self.__class__(str(self) + str(other), a) 1638 elif isinstance(other, basestring): 1639 #other is a plain string - use the current alphabet 1640 return self.__class__(str(self) + str(other), self.alphabet) 1641 else: 1642 raise TypeError
1643
1644 - def __radd__(self, other):
1645 if hasattr(other, "alphabet"): 1646 #other should be a Seq or a MutableSeq 1647 if not Alphabet._check_type_compatible([self.alphabet, 1648 other.alphabet]): 1649 raise TypeError("Incompatable alphabets %s and %s" 1650 % (repr(self.alphabet), repr(other.alphabet))) 1651 #They should be the same sequence type (or one of them is generic) 1652 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1653 if isinstance(other, MutableSeq): 1654 #See test_GAQueens.py for an historic usage of a non-string 1655 #alphabet! Adding the arrays should support this. 1656 return self.__class__(other.data + self.data, a) 1657 else: 1658 return self.__class__(str(other) + str(self), a) 1659 elif isinstance(other, basestring): 1660 #other is a plain string - use the current alphabet 1661 return self.__class__(str(other) + str(self), self.alphabet) 1662 else: 1663 raise TypeError
1664
1665 - def append(self, c):
1666 self.data.append(c)
1667
1668 - def insert(self, i, c):
1669 self.data.insert(i, c)
1670
1671 - def pop(self, i = (-1)):
1672 c = self.data[i] 1673 del self.data[i] 1674 return c
1675
1676 - def remove(self, item):
1677 for i in range(len(self.data)): 1678 if self.data[i] == item: 1679 del self.data[i] 1680 return 1681 raise ValueError("MutableSeq.remove(x): x not in list")
1682
1683 - def count(self, sub, start=0, end=sys.maxsize):
1684 """Non-overlapping count method, like that of a python string. 1685 1686 This behaves like the python string method of the same name, 1687 which does a non-overlapping count! 1688 1689 Returns an integer, the number of occurrences of substring 1690 argument sub in the (sub)sequence given by [start:end]. 1691 Optional arguments start and end are interpreted as in slice 1692 notation. 1693 1694 Arguments: 1695 - sub - a string or another Seq object to look for 1696 - start - optional integer, slice start 1697 - end - optional integer, slice end 1698 1699 e.g. 1700 1701 >>> from Bio.Seq import MutableSeq 1702 >>> my_mseq = MutableSeq("AAAATGA") 1703 >>> print(my_mseq.count("A")) 1704 5 1705 >>> print(my_mseq.count("ATG")) 1706 1 1707 >>> print(my_mseq.count(Seq("AT"))) 1708 1 1709 >>> print(my_mseq.count("AT", 2, -1)) 1710 1 1711 1712 HOWEVER, please note because that python strings, Seq objects and 1713 MutableSeq objects do a non-overlapping search, this may not give 1714 the answer you expect: 1715 1716 >>> "AAAA".count("AA") 1717 2 1718 >>> print(MutableSeq("AAAA").count("AA")) 1719 2 1720 1721 A non-overlapping search would give the answer as three! 1722 """ 1723 try: 1724 #TODO - Should we check the alphabet? 1725 search = sub.tostring() 1726 except AttributeError: 1727 search = sub 1728 1729 if not isinstance(search, basestring): 1730 raise TypeError("expected a string, Seq or MutableSeq") 1731 1732 if len(search) == 1: 1733 #Try and be efficient and work directly from the array. 1734 count = 0 1735 for c in self.data[start:end]: 1736 if c == search: 1737 count += 1 1738 return count 1739 else: 1740 #TODO - Can we do this more efficiently? 1741 return self.tostring().count(search, start, end)
1742
1743 - def index(self, item):
1744 for i in range(len(self.data)): 1745 if self.data[i] == item: 1746 return i 1747 raise ValueError("MutableSeq.index(x): x not in list")
1748
1749 - def reverse(self):
1750 """Modify the mutable sequence to reverse itself. 1751 1752 No return value. 1753 """ 1754 self.data.reverse()
1755
1756 - def complement(self):
1757 """Modify the mutable sequence to take on its complement. 1758 1759 Trying to complement a protein sequence raises an exception. 1760 1761 No return value. 1762 """ 1763 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1764 Alphabet.ProteinAlphabet): 1765 raise ValueError("Proteins do not have complements!") 1766 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1767 d = ambiguous_dna_complement 1768 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1769 d = ambiguous_rna_complement 1770 elif 'U' in self.data and 'T' in self.data: 1771 #TODO - Handle this cleanly? 1772 raise ValueError("Mixed RNA/DNA found") 1773 elif 'U' in self.data: 1774 d = ambiguous_rna_complement 1775 else: 1776 d = ambiguous_dna_complement 1777 c = dict([(x.lower(), y.lower()) for x, y in d.items()]) 1778 d.update(c) 1779 self.data = [d[c] for c in self.data] 1780 self.data = array.array(self.array_indicator, self.data)
1781
1782 - def reverse_complement(self):
1783 """Modify the mutable sequence to take on its reverse complement. 1784 1785 Trying to reverse complement a protein sequence raises an exception. 1786 1787 No return value. 1788 """ 1789 self.complement() 1790 self.data.reverse()
1791 1792 ## Sorting a sequence makes no sense. 1793 # def sort(self, *args): self.data.sort(*args) 1794
1795 - def extend(self, other):
1796 if isinstance(other, MutableSeq): 1797 for c in other.data: 1798 self.data.append(c) 1799 else: 1800 for c in other: 1801 self.data.append(c)
1802
1803 - def tostring(self):
1804 """Returns the full sequence as a python string (semi-obsolete). 1805 1806 Although not formally deprecated, you are now encouraged to use 1807 str(my_seq) instead of my_seq.tostring(). 1808 1809 Because str(my_seq) will give you the full sequence as a python string, 1810 there is often no need to make an explicit conversion. For example, 1811 1812 print("ID={%s}, sequence={%s}" % (my_name, my_seq)) 1813 1814 On Biopython 1.44 or older you would have to have done this: 1815 1816 print("ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())) 1817 """ 1818 return "".join(self.data)
1819
1820 - def toseq(self):
1821 """Returns the full sequence as a new immutable Seq object. 1822 1823 >>> from Bio.Seq import Seq 1824 >>> from Bio.Alphabet import IUPAC 1825 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1826 ... IUPAC.protein) 1827 >>> my_mseq 1828 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1829 >>> my_mseq.toseq() 1830 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1831 1832 Note that the alphabet is preserved. 1833 """ 1834 return Seq("".join(self.data), self.alphabet)
1835 1836 1837 # The transcribe, backward_transcribe, and translate functions are 1838 # user-friendly versions of the corresponding functions in Bio.Transcribe 1839 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1840
1841 -def transcribe(dna):
1842 """Transcribes a DNA sequence into RNA. 1843 1844 If given a string, returns a new string object. 1845 1846 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1847 1848 Trying to transcribe a protein or RNA sequence raises an exception. 1849 1850 e.g. 1851 1852 >>> transcribe("ACTGN") 1853 'ACUGN' 1854 """ 1855 if isinstance(dna, Seq): 1856 return dna.transcribe() 1857 elif isinstance(dna, MutableSeq): 1858 return dna.toseq().transcribe() 1859 else: 1860 return dna.replace('T', 'U').replace('t', 'u')
1861 1862
1863 -def back_transcribe(rna):
1864 """Back-transcribes an RNA sequence into DNA. 1865 1866 If given a string, returns a new string object. 1867 1868 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1869 1870 Trying to transcribe a protein or DNA sequence raises an exception. 1871 1872 e.g. 1873 1874 >>> back_transcribe("ACUGN") 1875 'ACTGN' 1876 """ 1877 if isinstance(rna, Seq): 1878 return rna.back_transcribe() 1879 elif isinstance(rna, MutableSeq): 1880 return rna.toseq().back_transcribe() 1881 else: 1882 return rna.replace('U', 'T').replace('u', 't')
1883 1884
1885 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1886 cds=False, pos_stop="X"):
1887 """Helper function to translate a nucleotide string (PRIVATE). 1888 1889 Arguments: 1890 - sequence - a string 1891 - table - a CodonTable object (NOT a table name or id number) 1892 - stop_symbol - a single character string, what to use for terminators. 1893 - to_stop - boolean, should translation terminate at the first 1894 in frame stop codon? If there is no in-frame stop codon 1895 then translation continues to the end. 1896 - pos_stop - a single character string for a possible stop codon 1897 (e.g. TAN or NNN) 1898 - cds - Boolean, indicates this is a complete CDS. If True, this 1899 checks the sequence starts with a valid alternative start 1900 codon (which will be translated as methionine, M), that the 1901 sequence length is a multiple of three, and that there is a 1902 single in frame stop codon at the end (this will be excluded 1903 from the protein sequence, regardless of the to_stop option). 1904 If these tests fail, an exception is raised. 1905 1906 Returns a string. 1907 1908 e.g. 1909 1910 >>> from Bio.Data import CodonTable 1911 >>> table = CodonTable.ambiguous_dna_by_id[1] 1912 >>> _translate_str("AAA", table) 1913 'K' 1914 >>> _translate_str("TAR", table) 1915 '*' 1916 >>> _translate_str("TAN", table) 1917 'X' 1918 >>> _translate_str("TAN", table, pos_stop="@") 1919 '@' 1920 >>> _translate_str("TA?", table) 1921 Traceback (most recent call last): 1922 ... 1923 TranslationError: Codon 'TA?' is invalid 1924 1925 In a change to older verions of Biopython, partial codons are now 1926 always regarded as an error (previously only checked if cds=True) 1927 and will trigger a warning (likely to become an exception in a 1928 future release). 1929 1930 If cds=True, the start and stop codons are checked, and the start 1931 codon will be translated at methionine. The sequence must be an 1932 while number of codons. 1933 1934 >>> _translate_str("ATGCCCTAG", table, cds=True) 1935 'MP' 1936 >>> _translate_str("AAACCCTAG", table, cds=True) 1937 Traceback (most recent call last): 1938 ... 1939 TranslationError: First codon 'AAA' is not a start codon 1940 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1941 Traceback (most recent call last): 1942 ... 1943 TranslationError: Extra in frame stop codon found. 1944 """ 1945 sequence = sequence.upper() 1946 amino_acids = [] 1947 forward_table = table.forward_table 1948 stop_codons = table.stop_codons 1949 if table.nucleotide_alphabet.letters is not None: 1950 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1951 else: 1952 #Assume the worst case, ambiguous DNA or RNA: 1953 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 1954 IUPAC.ambiguous_rna.letters.upper()) 1955 n = len(sequence) 1956 if cds: 1957 if str(sequence[:3]).upper() not in table.start_codons: 1958 raise CodonTable.TranslationError( 1959 "First codon '%s' is not a start codon" % sequence[:3]) 1960 if n % 3 != 0: 1961 raise CodonTable.TranslationError( 1962 "Sequence length %i is not a multiple of three" % n) 1963 if str(sequence[-3:]).upper() not in stop_codons: 1964 raise CodonTable.TranslationError( 1965 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1966 #Don't translate the stop symbol, and manually translate the M 1967 sequence = sequence[3:-3] 1968 n -= 6 1969 amino_acids = ["M"] 1970 elif n % 3 != 0: 1971 import warnings 1972 from Bio import BiopythonWarning 1973 warnings.warn("Partial codon, len(sequence) not a multiple of three. " 1974 "Explicitly trim the sequence or add trailing N before " 1975 "translation. This may become an error in future.", 1976 BiopythonWarning) 1977 for i in range(0, n - n%3, 3): 1978 codon = sequence[i:i+3] 1979 try: 1980 amino_acids.append(forward_table[codon]) 1981 except (KeyError, CodonTable.TranslationError): 1982 #Todo? Treat "---" as a special case (gapped translation) 1983 if codon in table.stop_codons: 1984 if cds: 1985 raise CodonTable.TranslationError( 1986 "Extra in frame stop codon found.") 1987 if to_stop: 1988 break 1989 amino_acids.append(stop_symbol) 1990 elif valid_letters.issuperset(set(codon)): 1991 #Possible stop codon (e.g. NNN or TAN) 1992 amino_acids.append(pos_stop) 1993 else: 1994 raise CodonTable.TranslationError( 1995 "Codon '%s' is invalid" % codon) 1996 return "".join(amino_acids)
1997 1998
1999 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 2000 cds=False):
2001 """Translate a nucleotide sequence into amino acids. 2002 2003 If given a string, returns a new string object. Given a Seq or 2004 MutableSeq, returns a Seq object with a protein alphabet. 2005 2006 Arguments: 2007 - table - Which codon table to use? This can be either a name (string), 2008 an NCBI identifier (integer), or a CodonTable object (useful 2009 for non-standard genetic codes). Defaults to the "Standard" 2010 table. 2011 - stop_symbol - Single character string, what to use for any 2012 terminators, defaults to the asterisk, "*". 2013 - to_stop - Boolean, defaults to False meaning do a full 2014 translation continuing on past any stop codons 2015 (translated as the specified stop_symbol). If 2016 True, translation is terminated at the first in 2017 frame stop codon (and the stop_symbol is not 2018 appended to the returned protein sequence). 2019 - cds - Boolean, indicates this is a complete CDS. If True, this 2020 checks the sequence starts with a valid alternative start 2021 codon (which will be translated as methionine, M), that the 2022 sequence length is a multiple of three, and that there is a 2023 single in frame stop codon at the end (this will be excluded 2024 from the protein sequence, regardless of the to_stop option). 2025 If these tests fail, an exception is raised. 2026 2027 A simple string example using the default (standard) genetic code: 2028 2029 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2030 >>> translate(coding_dna) 2031 'VAIVMGR*KGAR*' 2032 >>> translate(coding_dna, stop_symbol="@") 2033 'VAIVMGR@KGAR@' 2034 >>> translate(coding_dna, to_stop=True) 2035 'VAIVMGR' 2036 2037 Now using NCBI table 2, where TGA is not a stop codon: 2038 2039 >>> translate(coding_dna, table=2) 2040 'VAIVMGRWKGAR*' 2041 >>> translate(coding_dna, table=2, to_stop=True) 2042 'VAIVMGRWKGAR' 2043 2044 In fact this example uses an alternative start codon valid under NCBI table 2, 2045 GTG, which means this example is a complete valid CDS which when translated 2046 should really start with methionine (not valine): 2047 2048 >>> translate(coding_dna, table=2, cds=True) 2049 'MAIVMGRWKGAR' 2050 2051 Note that if the sequence has no in-frame stop codon, then the to_stop 2052 argument has no effect: 2053 2054 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2055 >>> translate(coding_dna2) 2056 'VAIVMGR' 2057 >>> translate(coding_dna2, to_stop=True) 2058 'VAIVMGR' 2059 2060 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2061 or a stop codon. These are translated as "X". Any invalid codon 2062 (e.g. "TA?" or "T-A") will throw a TranslationError. 2063 2064 NOTE - Does NOT support gapped sequences. 2065 2066 It will however translate either DNA or RNA. 2067 """ 2068 if isinstance(sequence, Seq): 2069 return sequence.translate(table, stop_symbol, to_stop, cds) 2070 elif isinstance(sequence, MutableSeq): 2071 #Return a Seq object 2072 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2073 else: 2074 #Assume its a string, return a string 2075 try: 2076 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2077 except ValueError: 2078 codon_table = CodonTable.ambiguous_generic_by_name[table] 2079 except (AttributeError, TypeError): 2080 if isinstance(table, CodonTable.CodonTable): 2081 codon_table = table 2082 else: 2083 raise ValueError('Bad table argument') 2084 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2085 2086
2087 -def reverse_complement(sequence):
2088 """Returns the reverse complement sequence of a nucleotide string. 2089 2090 If given a string, returns a new string object. 2091 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2092 2093 Supports unambiguous and ambiguous nucleotide sequences. 2094 2095 e.g. 2096 2097 >>> reverse_complement("ACTG-NH") 2098 'DN-CAGT' 2099 """ 2100 if isinstance(sequence, Seq): 2101 #Return a Seq 2102 return sequence.reverse_complement() 2103 elif isinstance(sequence, MutableSeq): 2104 #Return a Seq 2105 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 2106 return sequence.toseq().reverse_complement() 2107 2108 #Assume its a string. 2109 #In order to avoid some code duplication, the old code would turn the string 2110 #into a Seq, use the reverse_complement method, and convert back to a string. 2111 #This worked, but is over five times slower on short sequences! 2112 if ('U' in sequence or 'u' in sequence) \ 2113 and ('T' in sequence or 't' in sequence): 2114 raise ValueError("Mixed RNA/DNA found") 2115 elif 'U' in sequence or 'u' in sequence: 2116 ttable = _rna_complement_table 2117 else: 2118 ttable = _dna_complement_table 2119 return sequence.translate(ttable)[::-1]
2120 2121
2122 -def _test():
2123 """Run the Bio.Seq module's doctests (PRIVATE).""" 2124 if sys.version_info[0:2] == (3, 1): 2125 print("Not running Bio.Seq doctest on Python 3.1") 2126 print("See http://bugs.python.org/issue7490") 2127 else: 2128 print("Running doctests...") 2129 import doctest 2130 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2131 print("Done")
2132 2133 if __name__ == "__main__": 2134 _test() 2135