Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  __docformat__ ="epytext en"  # Don't just use plain text in epydoc API pages! 
  15   
  16  import string  # for maketrans only 
  17  import array 
  18  import sys 
  19   
  20  from Bio import Alphabet 
  21  from Bio.Alphabet import IUPAC 
  22  from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  23  from Bio.Data import CodonTable 
  24   
  25   
26 -def _maketrans(complement_mapping):
27 """Makes a python string translation table (PRIVATE). 28 29 Arguments: 30 - complement_mapping - a dictionary such as ambiguous_dna_complement 31 and ambiguous_rna_complement from Data.IUPACData. 32 33 Returns a translation table (a string of length 256) for use with the 34 python string's translate method to use in a (reverse) complement. 35 36 Compatible with lower case and upper case sequences. 37 38 For internal use only. 39 """ 40 before = ''.join(complement_mapping.keys()) 41 after = ''.join(complement_mapping.values()) 42 before = before + before.lower() 43 after = after + after.lower() 44 if sys.version_info[0] == 3: 45 return str.maketrans(before, after) 46 else: 47 return string.maketrans(before, after)
48 49 _dna_complement_table = _maketrans(ambiguous_dna_complement) 50 _rna_complement_table = _maketrans(ambiguous_rna_complement) 51 52
53 -class Seq(object):
54 """A read-only sequence object (essentially a string with an alphabet). 55 56 Like normal python strings, our basic sequence object is immutable. 57 This prevents you from doing my_seq[5] = "A" for example, but does allow 58 Seq objects to be used as dictionary keys. 59 60 The Seq object provides a number of string like methods (such as count, 61 find, split and strip), which are alphabet aware where appropriate. 62 63 In addition to the string like sequence, the Seq object has an alphabet 64 property. This is an instance of an Alphabet class from Bio.Alphabet, 65 for example generic DNA, or IUPAC DNA. This describes the type of molecule 66 (e.g. RNA, DNA, protein) and may also indicate the expected symbols 67 (letters). 68 69 The Seq object also provides some biological methods, such as complement, 70 reverse_complement, transcribe, back_transcribe and translate (which are 71 not applicable to sequences with a protein alphabet). 72 """
73 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
74 """Create a Seq object. 75 76 Arguments: 77 - seq - Sequence, required (string) 78 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 79 80 You will typically use Bio.SeqIO to read in sequences from files as 81 SeqRecord objects, whose sequence will be exposed as a Seq object via 82 the seq property. 83 84 However, will often want to create your own Seq objects directly: 85 86 >>> from Bio.Seq import Seq 87 >>> from Bio.Alphabet import IUPAC 88 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 89 ... IUPAC.protein) 90 >>> my_seq 91 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 92 >>> print my_seq 93 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 94 >>> my_seq.alphabet 95 IUPACProtein() 96 97 """ 98 # Enforce string storage 99 if not isinstance(data, basestring): 100 raise TypeError("The sequence data given to a Seq object should " 101 "be a string (not another Seq object etc)") 102 self._data = data 103 self.alphabet = alphabet # Seq API requirement
104
105 - def __repr__(self):
106 """Returns a (truncated) representation of the sequence for debugging.""" 107 if len(self) > 60: 108 #Shows the last three letters as it is often useful to see if there 109 #is a stop codon at the end of a sequence. 110 #Note total length is 54+3+3=60 111 return "%s('%s...%s', %s)" % (self.__class__.__name__, 112 str(self)[:54], str(self)[-3:], 113 repr(self.alphabet)) 114 else: 115 return "%s(%s, %s)" % (self.__class__.__name__, 116 repr(self._data), 117 repr(self.alphabet))
118
119 - def __str__(self):
120 """Returns the full sequence as a python string, use str(my_seq). 121 122 Note that Biopython 1.44 and earlier would give a truncated 123 version of repr(my_seq) for str(my_seq). If you are writing code 124 which need to be backwards compatible with old Biopython, you 125 should continue to use my_seq.tostring() rather than str(my_seq). 126 """ 127 return self._data
128
129 - def __hash__(self):
130 """Hash for comparison. 131 132 See the __cmp__ documentation - we plan to change this! 133 """ 134 return id(self) # Currently use object identity for equality testing
135
136 - def __cmp__(self, other):
137 """Compare the sequence to another sequence or a string (README). 138 139 Historically comparing Seq objects has done Python object comparison. 140 After considerable discussion (keeping in mind constraints of the 141 Python language, hashes and dictionary support) a future release of 142 Biopython will change this to use simple string comparison. The plan is 143 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a 144 warning. 145 146 This version of Biopython still does Python object comparison, but with 147 a warning about this future change. During this transition period, 148 please just do explicit comparisons: 149 150 >>> seq1 = Seq("ACGT") 151 >>> seq2 = Seq("ACGT") 152 >>> id(seq1) == id(seq2) 153 False 154 >>> str(seq1) == str(seq2) 155 True 156 157 Note - This method indirectly supports ==, < , etc. 158 """ 159 if hasattr(other, "alphabet"): 160 #other should be a Seq or a MutableSeq 161 import warnings 162 warnings.warn("In future comparing Seq objects will use string " 163 "comparison (not object comparison). Incompatible " 164 "alphabets will trigger a warning (not an exception). " 165 "In the interim please use id(seq1)==id(seq2) or " 166 "str(seq1)==str(seq2) to make your code explicit " 167 "and to avoid this warning.", FutureWarning) 168 return cmp(id(self), id(other))
169
170 - def __len__(self):
171 """Returns the length of the sequence, use len(my_seq).""" 172 return len(self._data) # Seq API requirement
173
174 - def __getitem__(self, index): # Seq API requirement
175 """Returns a subsequence of single letter, use my_seq[index].""" 176 #Note since Python 2.0, __getslice__ is deprecated 177 #and __getitem__ is used instead. 178 #See http://docs.python.org/ref/sequence-methods.html 179 if isinstance(index, int): 180 #Return a single letter as a string 181 return self._data[index] 182 else: 183 #Return the (sub)sequence as another Seq object 184 return Seq(self._data[index], self.alphabet)
185
186 - def __add__(self, other):
187 """Add another sequence or string to this sequence. 188 189 If adding a string to a Seq, the alphabet is preserved: 190 191 >>> from Bio.Seq import Seq 192 >>> from Bio.Alphabet import generic_protein 193 >>> Seq("MELKI", generic_protein) + "LV" 194 Seq('MELKILV', ProteinAlphabet()) 195 196 When adding two Seq (like) objects, the alphabets are important. 197 Consider this example: 198 199 >>> from Bio.Seq import Seq 200 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna 201 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna) 202 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna) 203 >>> unamb_dna_seq 204 Seq('ACGT', IUPACUnambiguousDNA()) 205 >>> ambig_dna_seq 206 Seq('ACRGT', IUPACAmbiguousDNA()) 207 208 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get 209 the more general ambiguous IUPAC DNA alphabet: 210 211 >>> unamb_dna_seq + ambig_dna_seq 212 Seq('ACGTACRGT', IUPACAmbiguousDNA()) 213 214 However, if the default generic alphabet is included, the result is 215 a generic alphabet: 216 217 >>> Seq("") + ambig_dna_seq 218 Seq('ACRGT', Alphabet()) 219 220 You can't add RNA and DNA sequences: 221 222 >>> from Bio.Alphabet import generic_dna, generic_rna 223 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna) 224 Traceback (most recent call last): 225 ... 226 TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet() 227 228 You can't add nucleotide and protein sequences: 229 230 >>> from Bio.Alphabet import generic_dna, generic_protein 231 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein) 232 Traceback (most recent call last): 233 ... 234 TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet() 235 """ 236 if hasattr(other, "alphabet"): 237 #other should be a Seq or a MutableSeq 238 if not Alphabet._check_type_compatible([self.alphabet, 239 other.alphabet]): 240 raise TypeError("Incompatible alphabets %s and %s" 241 % (repr(self.alphabet), repr(other.alphabet))) 242 #They should be the same sequence type (or one of them is generic) 243 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 244 return self.__class__(str(self) + str(other), a) 245 elif isinstance(other, basestring): 246 #other is a plain string - use the current alphabet 247 return self.__class__(str(self) + other, self.alphabet) 248 from Bio.SeqRecord import SeqRecord # Lazy to avoid circular imports 249 if isinstance(other, SeqRecord): 250 #Get the SeqRecord's __radd__ to handle this 251 return NotImplemented 252 else: 253 raise TypeError
254
255 - def __radd__(self, other):
256 """Adding a sequence on the left. 257 258 If adding a string to a Seq, the alphabet is preserved: 259 260 >>> from Bio.Seq import Seq 261 >>> from Bio.Alphabet import generic_protein 262 >>> "LV" + Seq("MELKI", generic_protein) 263 Seq('LVMELKI', ProteinAlphabet()) 264 265 Adding two Seq (like) objects is handled via the __add__ method. 266 """ 267 if hasattr(other, "alphabet"): 268 #other should be a Seq or a MutableSeq 269 if not Alphabet._check_type_compatible([self.alphabet, 270 other.alphabet]): 271 raise TypeError("Incompatable alphabets %s and %s" 272 % (repr(self.alphabet), repr(other.alphabet))) 273 #They should be the same sequence type (or one of them is generic) 274 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 275 return self.__class__(str(other) + str(self), a) 276 elif isinstance(other, basestring): 277 #other is a plain string - use the current alphabet 278 return self.__class__(other + str(self), self.alphabet) 279 else: 280 raise TypeError
281
282 - def tostring(self): # Seq API requirement
283 """Returns the full sequence as a python string (semi-obsolete). 284 285 Although not formally deprecated, you are now encouraged to use 286 str(my_seq) instead of my_seq.tostring().""" 287 #TODO - Fix all places elsewhere in Biopython using this method, 288 #then start deprecation process? 289 #import warnings 290 #warnings.warn("This method is obsolete; please use str(my_seq) " 291 # "instead of my_seq.tostring().", 292 # PendingDeprecationWarning) 293 return str(self) 294
295 - def tomutable(self): # Needed? Or use a function?
296 """Returns the full sequence as a MutableSeq object. 297 298 >>> from Bio.Seq import Seq 299 >>> from Bio.Alphabet import IUPAC 300 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 301 ... IUPAC.protein) 302 >>> my_seq 303 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 304 >>> my_seq.tomutable() 305 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 306 307 Note that the alphabet is preserved. 308 """ 309 return MutableSeq(str(self), self.alphabet) 310
311 - def _get_seq_str_and_check_alphabet(self, other_sequence):
312 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 313 314 For a string argument, returns the string. 315 316 For a Seq or MutableSeq, it checks the alphabet is compatible 317 (raising an exception if it isn't), and then returns a string. 318 """ 319 try: 320 other_alpha = other_sequence.alphabet 321 except AttributeError: 322 #Assume other_sequence is a string 323 return other_sequence 324 325 #Other should be a Seq or a MutableSeq 326 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]): 327 raise TypeError("Incompatable alphabets %s and %s" 328 % (repr(self.alphabet), repr(other_alpha))) 329 #Return as a string 330 return str(other_sequence)
331
332 - def count(self, sub, start=0, end=sys.maxint):
333 """Non-overlapping count method, like that of a python string. 334 335 This behaves like the python string method of the same name, 336 which does a non-overlapping count! 337 338 Returns an integer, the number of occurrences of substring 339 argument sub in the (sub)sequence given by [start:end]. 340 Optional arguments start and end are interpreted as in slice 341 notation. 342 343 Arguments: 344 - sub - a string or another Seq object to look for 345 - start - optional integer, slice start 346 - end - optional integer, slice end 347 348 e.g. 349 350 >>> from Bio.Seq import Seq 351 >>> my_seq = Seq("AAAATGA") 352 >>> print my_seq.count("A") 353 5 354 >>> print my_seq.count("ATG") 355 1 356 >>> print my_seq.count(Seq("AT")) 357 1 358 >>> print my_seq.count("AT", 2, -1) 359 1 360 361 HOWEVER, please note because python strings and Seq objects (and 362 MutableSeq objects) do a non-overlapping search, this may not give 363 the answer you expect: 364 365 >>> "AAAA".count("AA") 366 2 367 >>> print Seq("AAAA").count("AA") 368 2 369 370 A non-overlapping search would give the answer as three! 371 """ 372 #If it has one, check the alphabet: 373 sub_str = self._get_seq_str_and_check_alphabet(sub) 374 return str(self).count(sub_str, start, end)
375
376 - def __contains__(self, char):
377 """Implements the 'in' keyword, like a python string. 378 379 e.g. 380 381 >>> from Bio.Seq import Seq 382 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein 383 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna) 384 >>> "AAA" in my_dna 385 True 386 >>> Seq("AAA") in my_dna 387 True 388 >>> Seq("AAA", generic_dna) in my_dna 389 True 390 391 Like other Seq methods, this will raise a type error if another Seq 392 (or Seq like) object with an incompatible alphabet is used: 393 394 >>> Seq("AAA", generic_rna) in my_dna 395 Traceback (most recent call last): 396 ... 397 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet() 398 >>> Seq("AAA", generic_protein) in my_dna 399 Traceback (most recent call last): 400 ... 401 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet() 402 """ 403 #If it has one, check the alphabet: 404 sub_str = self._get_seq_str_and_check_alphabet(char) 405 return sub_str in str(self)
406
407 - def find(self, sub, start=0, end=sys.maxint):
408 """Find method, like that of a python string. 409 410 This behaves like the python string method of the same name. 411 412 Returns an integer, the index of the first occurrence of substring 413 argument sub in the (sub)sequence given by [start:end]. 414 415 Arguments: 416 - sub - a string or another Seq object to look for 417 - start - optional integer, slice start 418 - end - optional integer, slice end 419 420 Returns -1 if the subsequence is NOT found. 421 422 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 423 424 >>> from Bio.Seq import Seq 425 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 426 >>> my_rna.find("AUG") 427 3 428 """ 429 #If it has one, check the alphabet: 430 sub_str = self._get_seq_str_and_check_alphabet(sub) 431 return str(self).find(sub_str, start, end)
432
433 - def rfind(self, sub, start=0, end=sys.maxint):
434 """Find from right method, like that of a python string. 435 436 This behaves like the python string method of the same name. 437 438 Returns an integer, the index of the last (right most) occurrence of 439 substring argument sub in the (sub)sequence given by [start:end]. 440 441 Arguments: 442 - sub - a string or another Seq object to look for 443 - start - optional integer, slice start 444 - end - optional integer, slice end 445 446 Returns -1 if the subsequence is NOT found. 447 448 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 449 450 >>> from Bio.Seq import Seq 451 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 452 >>> my_rna.rfind("AUG") 453 15 454 """ 455 #If it has one, check the alphabet: 456 sub_str = self._get_seq_str_and_check_alphabet(sub) 457 return str(self).rfind(sub_str, start, end)
458
459 - def startswith(self, prefix, start=0, end=sys.maxint):
460 """Does the Seq start with the given prefix? Returns True/False. 461 462 This behaves like the python string method of the same name. 463 464 Return True if the sequence starts with the specified prefix 465 (a string or another Seq object), False otherwise. 466 With optional start, test sequence beginning at that position. 467 With optional end, stop comparing sequence at that position. 468 prefix can also be a tuple of strings to try. e.g. 469 470 >>> from Bio.Seq import Seq 471 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 472 >>> my_rna.startswith("GUC") 473 True 474 >>> my_rna.startswith("AUG") 475 False 476 >>> my_rna.startswith("AUG", 3) 477 True 478 >>> my_rna.startswith(("UCC","UCA","UCG"),1) 479 True 480 """ 481 #If it has one, check the alphabet: 482 if isinstance(prefix, tuple): 483 prefix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 484 for p in prefix) 485 return str(self).startswith(prefix_strs, start, end) 486 else: 487 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 488 return str(self).startswith(prefix_str, start, end)
489
490 - def endswith(self, suffix, start=0, end=sys.maxint):
491 """Does the Seq end with the given suffix? Returns True/False. 492 493 This behaves like the python string method of the same name. 494 495 Return True if the sequence ends with the specified suffix 496 (a string or another Seq object), False otherwise. 497 With optional start, test sequence beginning at that position. 498 With optional end, stop comparing sequence at that position. 499 suffix can also be a tuple of strings to try. e.g. 500 501 >>> from Bio.Seq import Seq 502 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 503 >>> my_rna.endswith("UUG") 504 True 505 >>> my_rna.endswith("AUG") 506 False 507 >>> my_rna.endswith("AUG", 0, 18) 508 True 509 >>> my_rna.endswith(("UCC","UCA","UUG")) 510 True 511 """ 512 #If it has one, check the alphabet: 513 if isinstance(suffix, tuple): 514 suffix_strs = tuple(self._get_seq_str_and_check_alphabet(p) 515 for p in suffix) 516 return str(self).endswith(suffix_strs, start, end) 517 else: 518 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 519 return str(self).endswith(suffix_str, start, end)
520
521 - def split(self, sep=None, maxsplit=-1):
522 """Split method, like that of a python string. 523 524 This behaves like the python string method of the same name. 525 526 Return a list of the 'words' in the string (as Seq objects), 527 using sep as the delimiter string. If maxsplit is given, at 528 most maxsplit splits are done. If maxsplit is omitted, all 529 splits are made. 530 531 Following the python string method, sep will by default be any 532 white space (tabs, spaces, newlines) but this is unlikely to 533 apply to biological sequences. 534 535 e.g. 536 537 >>> from Bio.Seq import Seq 538 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 539 >>> my_aa = my_rna.translate() 540 >>> my_aa 541 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 542 >>> my_aa.split("*") 543 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 544 >>> my_aa.split("*",1) 545 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 546 547 See also the rsplit method: 548 549 >>> my_aa.rsplit("*",1) 550 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 551 """ 552 #If it has one, check the alphabet: 553 sep_str = self._get_seq_str_and_check_alphabet(sep) 554 #TODO - If the sep is the defined stop symbol, or gap char, 555 #should we adjust the alphabet? 556 return [Seq(part, self.alphabet) 557 for part in str(self).split(sep_str, maxsplit)]
558
559 - def rsplit(self, sep=None, maxsplit=-1):
560 """Right split method, like that of a python string. 561 562 This behaves like the python string method of the same name. 563 564 Return a list of the 'words' in the string (as Seq objects), 565 using sep as the delimiter string. If maxsplit is given, at 566 most maxsplit splits are done COUNTING FROM THE RIGHT. 567 If maxsplit is omitted, all splits are made. 568 569 Following the python string method, sep will by default be any 570 white space (tabs, spaces, newlines) but this is unlikely to 571 apply to biological sequences. 572 573 e.g. print my_seq.rsplit("*",1) 574 575 See also the split method. 576 """ 577 #If it has one, check the alphabet: 578 sep_str = self._get_seq_str_and_check_alphabet(sep) 579 return [Seq(part, self.alphabet) 580 for part in str(self).rsplit(sep_str, maxsplit)]
581
582 - def strip(self, chars=None):
583 """Returns a new Seq object with leading and trailing ends stripped. 584 585 This behaves like the python string method of the same name. 586 587 Optional argument chars defines which characters to remove. If 588 omitted or None (default) then as for the python string method, 589 this defaults to removing any white space. 590 591 e.g. print my_seq.strip("-") 592 593 See also the lstrip and rstrip methods. 594 """ 595 #If it has one, check the alphabet: 596 strip_str = self._get_seq_str_and_check_alphabet(chars) 597 return Seq(str(self).strip(strip_str), self.alphabet)
598
599 - def lstrip(self, chars=None):
600 """Returns a new Seq object with leading (left) end stripped. 601 602 This behaves like the python string method of the same name. 603 604 Optional argument chars defines which characters to remove. If 605 omitted or None (default) then as for the python string method, 606 this defaults to removing any white space. 607 608 e.g. print my_seq.lstrip("-") 609 610 See also the strip and rstrip methods. 611 """ 612 #If it has one, check the alphabet: 613 strip_str = self._get_seq_str_and_check_alphabet(chars) 614 return Seq(str(self).lstrip(strip_str), self.alphabet)
615
616 - def rstrip(self, chars=None):
617 """Returns a new Seq object with trailing (right) end stripped. 618 619 This behaves like the python string method of the same name. 620 621 Optional argument chars defines which characters to remove. If 622 omitted or None (default) then as for the python string method, 623 this defaults to removing any white space. 624 625 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 626 627 >>> from Bio.Alphabet import IUPAC 628 >>> from Bio.Seq import Seq 629 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 630 >>> my_seq 631 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 632 >>> my_seq.rstrip("A") 633 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 634 635 See also the strip and lstrip methods. 636 """ 637 #If it has one, check the alphabet: 638 strip_str = self._get_seq_str_and_check_alphabet(chars) 639 return Seq(str(self).rstrip(strip_str), self.alphabet)
640
641 - def upper(self):
642 """Returns an upper case copy of the sequence. 643 644 >>> from Bio.Alphabet import HasStopCodon, generic_protein 645 >>> from Bio.Seq import Seq 646 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein)) 647 >>> my_seq 648 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*')) 649 >>> my_seq.lower() 650 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*')) 651 >>> my_seq.upper() 652 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*')) 653 654 This will adjust the alphabet if required. See also the lower method. 655 """ 656 return Seq(str(self).upper(), self.alphabet._upper())
657
658 - def lower(self):
659 """Returns a lower case copy of the sequence. 660 661 This will adjust the alphabet if required. Note that the IUPAC alphabets 662 are upper case only, and thus a generic alphabet must be substituted. 663 664 >>> from Bio.Alphabet import Gapped, generic_dna 665 >>> from Bio.Alphabet import IUPAC 666 >>> from Bio.Seq import Seq 667 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*")) 668 >>> my_seq 669 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*')) 670 >>> my_seq.lower() 671 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*')) 672 673 See also the upper method. 674 """ 675 return Seq(str(self).lower(), self.alphabet._lower())
676
677 - def complement(self):
678 """Returns the complement sequence. New Seq object. 679 680 >>> from Bio.Seq import Seq 681 >>> from Bio.Alphabet import IUPAC 682 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 683 >>> my_dna 684 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 685 >>> my_dna.complement() 686 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 687 688 You can of course used mixed case sequences, 689 690 >>> from Bio.Seq import Seq 691 >>> from Bio.Alphabet import generic_dna 692 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 693 >>> my_dna 694 Seq('CCCCCgatA-GD', DNAAlphabet()) 695 >>> my_dna.complement() 696 Seq('GGGGGctaT-CH', DNAAlphabet()) 697 698 Note in the above example, ambiguous character D denotes 699 G, A or T so its complement is H (for C, T or A). 700 701 Trying to complement a protein sequence raises an exception. 702 703 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 704 >>> my_protein.complement() 705 Traceback (most recent call last): 706 ... 707 ValueError: Proteins do not have complements! 708 """ 709 base = Alphabet._get_base_alphabet(self.alphabet) 710 if isinstance(base, Alphabet.ProteinAlphabet): 711 raise ValueError("Proteins do not have complements!") 712 if isinstance(base, Alphabet.DNAAlphabet): 713 ttable = _dna_complement_table 714 elif isinstance(base, Alphabet.RNAAlphabet): 715 ttable = _rna_complement_table 716 elif ('U' in self._data or 'u' in self._data) \ 717 and ('T' in self._data or 't' in self._data): 718 #TODO - Handle this cleanly? 719 raise ValueError("Mixed RNA/DNA found") 720 elif 'U' in self._data or 'u' in self._data: 721 ttable = _rna_complement_table 722 else: 723 ttable = _dna_complement_table 724 #Much faster on really long sequences than the previous loop based one. 725 #thx to Michael Palmer, University of Waterloo 726 return Seq(str(self).translate(ttable), self.alphabet)
727
728 - def reverse_complement(self):
729 """Returns the reverse complement sequence. New Seq object. 730 731 >>> from Bio.Seq import Seq 732 >>> from Bio.Alphabet import IUPAC 733 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 734 >>> my_dna 735 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 736 >>> my_dna.reverse_complement() 737 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 738 739 Note in the above example, since R = G or A, its complement 740 is Y (which denotes C or T). 741 742 You can of course used mixed case sequences, 743 744 >>> from Bio.Seq import Seq 745 >>> from Bio.Alphabet import generic_dna 746 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 747 >>> my_dna 748 Seq('CCCCCgatA-G', DNAAlphabet()) 749 >>> my_dna.reverse_complement() 750 Seq('C-TatcGGGGG', DNAAlphabet()) 751 752 Trying to complement a protein sequence raises an exception: 753 754 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 755 >>> my_protein.reverse_complement() 756 Traceback (most recent call last): 757 ... 758 ValueError: Proteins do not have complements! 759 """ 760 #Use -1 stride/step to reverse the complement 761 return self.complement()[::-1]
762
763 - def transcribe(self):
764 """Returns the RNA sequence from a DNA sequence. New Seq object. 765 766 >>> from Bio.Seq import Seq 767 >>> from Bio.Alphabet import IUPAC 768 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 769 ... IUPAC.unambiguous_dna) 770 >>> coding_dna 771 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 772 >>> coding_dna.transcribe() 773 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 774 775 Trying to transcribe a protein or RNA sequence raises an exception: 776 777 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 778 >>> my_protein.transcribe() 779 Traceback (most recent call last): 780 ... 781 ValueError: Proteins cannot be transcribed! 782 """ 783 base = Alphabet._get_base_alphabet(self.alphabet) 784 if isinstance(base, Alphabet.ProteinAlphabet): 785 raise ValueError("Proteins cannot be transcribed!") 786 if isinstance(base, Alphabet.RNAAlphabet): 787 raise ValueError("RNA cannot be transcribed!") 788 789 if self.alphabet==IUPAC.unambiguous_dna: 790 alphabet = IUPAC.unambiguous_rna 791 elif self.alphabet==IUPAC.ambiguous_dna: 792 alphabet = IUPAC.ambiguous_rna 793 else: 794 alphabet = Alphabet.generic_rna 795 return Seq(str(self).replace('T', 'U').replace('t', 'u'), alphabet)
796
797 - def back_transcribe(self):
798 """Returns the DNA sequence from an RNA sequence. New Seq object. 799 800 >>> from Bio.Seq import Seq 801 >>> from Bio.Alphabet import IUPAC 802 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 803 ... IUPAC.unambiguous_rna) 804 >>> messenger_rna 805 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 806 >>> messenger_rna.back_transcribe() 807 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 808 809 Trying to back-transcribe a protein or DNA sequence raises an 810 exception: 811 812 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 813 >>> my_protein.back_transcribe() 814 Traceback (most recent call last): 815 ... 816 ValueError: Proteins cannot be back transcribed! 817 """ 818 base = Alphabet._get_base_alphabet(self.alphabet) 819 if isinstance(base, Alphabet.ProteinAlphabet): 820 raise ValueError("Proteins cannot be back transcribed!") 821 if isinstance(base, Alphabet.DNAAlphabet): 822 raise ValueError("DNA cannot be back transcribed!") 823 824 if self.alphabet==IUPAC.unambiguous_rna: 825 alphabet = IUPAC.unambiguous_dna 826 elif self.alphabet==IUPAC.ambiguous_rna: 827 alphabet = IUPAC.ambiguous_dna 828 else: 829 alphabet = Alphabet.generic_dna 830 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
831
832 - def translate(self, table="Standard", stop_symbol="*", to_stop=False, 833 cds=False):
834 """Turns a nucleotide sequence into a protein sequence. New Seq object. 835 836 This method will translate DNA or RNA sequences, and those with a 837 nucleotide or generic alphabet. Trying to translate a protein 838 sequence raises an exception. 839 840 Arguments: 841 - table - Which codon table to use? This can be either a name 842 (string), an NCBI identifier (integer), or a CodonTable 843 object (useful for non-standard genetic codes). This 844 defaults to the "Standard" table. 845 - stop_symbol - Single character string, what to use for terminators. 846 This defaults to the asterisk, "*". 847 - to_stop - Boolean, defaults to False meaning do a full translation 848 continuing on past any stop codons (translated as the 849 specified stop_symbol). If True, translation is 850 terminated at the first in frame stop codon (and the 851 stop_symbol is not appended to the returned protein 852 sequence). 853 - cds - Boolean, indicates this is a complete CDS. If True, 854 this checks the sequence starts with a valid alternative start 855 codon (which will be translated as methionine, M), that the 856 sequence length is a multiple of three, and that there is a 857 single in frame stop codon at the end (this will be excluded 858 from the protein sequence, regardless of the to_stop option). 859 If these tests fail, an exception is raised. 860 861 e.g. Using the standard table: 862 863 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 864 >>> coding_dna.translate() 865 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 866 >>> coding_dna.translate(stop_symbol="@") 867 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 868 >>> coding_dna.translate(to_stop=True) 869 Seq('VAIVMGR', ExtendedIUPACProtein()) 870 871 Now using NCBI table 2, where TGA is not a stop codon: 872 873 >>> coding_dna.translate(table=2) 874 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 875 >>> coding_dna.translate(table=2, to_stop=True) 876 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 877 878 In fact, GTG is an alternative start codon under NCBI table 2, meaning 879 this sequence could be a complete CDS: 880 881 >>> coding_dna.translate(table=2, cds=True) 882 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein()) 883 884 It isn't a valid CDS under NCBI table 1, due to both the start codon and 885 also the in frame stop codons: 886 887 >>> coding_dna.translate(table=1, cds=True) 888 Traceback (most recent call last): 889 ... 890 TranslationError: First codon 'GTG' is not a start codon 891 892 If the sequence has no in-frame stop codon, then the to_stop argument 893 has no effect: 894 895 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 896 >>> coding_dna2.translate() 897 Seq('LAIVMGR', ExtendedIUPACProtein()) 898 >>> coding_dna2.translate(to_stop=True) 899 Seq('LAIVMGR', ExtendedIUPACProtein()) 900 901 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 902 or a stop codon. These are translated as "X". Any invalid codon 903 (e.g. "TA?" or "T-A") will throw a TranslationError. 904 905 NOTE - Does NOT support gapped sequences. 906 907 NOTE - This does NOT behave like the python string's translate 908 method. For that use str(my_seq).translate(...) instead. 909 """ 910 if isinstance(table, str) and len(table)==256: 911 raise ValueError("The Seq object translate method DOES NOT take " 912 + "a 256 character string mapping table like " 913 + "the python string object's translate method. " 914 + "Use str(my_seq).translate(...) instead.") 915 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 916 Alphabet.ProteinAlphabet): 917 raise ValueError("Proteins cannot be translated!") 918 try: 919 table_id = int(table) 920 except ValueError: 921 #Assume its a table name 922 if self.alphabet==IUPAC.unambiguous_dna: 923 #Will use standard IUPAC protein alphabet, no need for X 924 codon_table = CodonTable.unambiguous_dna_by_name[table] 925 elif self.alphabet==IUPAC.unambiguous_rna: 926 #Will use standard IUPAC protein alphabet, no need for X 927 codon_table = CodonTable.unambiguous_rna_by_name[table] 928 else: 929 #This will use the extended IUPAC protein alphabet with X etc. 930 #The same table can be used for RNA or DNA (we use this for 931 #translating strings). 932 codon_table = CodonTable.ambiguous_generic_by_name[table] 933 except (AttributeError, TypeError): 934 #Assume its a CodonTable object 935 if isinstance(table, CodonTable.CodonTable): 936 codon_table = table 937 else: 938 raise ValueError('Bad table argument') 939 else: 940 #Assume its a table ID 941 if self.alphabet==IUPAC.unambiguous_dna: 942 #Will use standard IUPAC protein alphabet, no need for X 943 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 944 elif self.alphabet==IUPAC.unambiguous_rna: 945 #Will use standard IUPAC protein alphabet, no need for X 946 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 947 else: 948 #This will use the extended IUPAC protein alphabet with X etc. 949 #The same table can be used for RNA or DNA (we use this for 950 #translating strings). 951 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 952 protein = _translate_str(str(self), codon_table, 953 stop_symbol, to_stop, cds) 954 if stop_symbol in protein: 955 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 956 stop_symbol = stop_symbol) 957 else: 958 alphabet = codon_table.protein_alphabet 959 return Seq(protein, alphabet)
960
961 - def ungap(self, gap=None):
962 """Return a copy of the sequence without the gap character(s). 963 964 The gap character can be specified in two ways - either as an explicit 965 argument, or via the sequence's alphabet. For example: 966 967 >>> from Bio.Seq import Seq 968 >>> from Bio.Alphabet import generic_dna 969 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna) 970 >>> my_dna 971 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 972 >>> my_dna.ungap("-") 973 Seq('ATATGAAATTTGAAAA', DNAAlphabet()) 974 975 If the gap character is not given as an argument, it will be taken from 976 the sequence's alphabet (if defined). Notice that the returned sequence's 977 alphabet is adjusted since it no longer requires a gapped alphabet: 978 979 >>> from Bio.Seq import Seq 980 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon 981 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "="))) 982 >>> my_pro 983 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*')) 984 >>> my_pro.ungap() 985 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*')) 986 987 Or, with a simpler gapped DNA example: 988 989 >>> from Bio.Seq import Seq 990 >>> from Bio.Alphabet import IUPAC, Gapped 991 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "=")) 992 >>> my_seq 993 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 994 >>> my_seq.ungap() 995 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 996 997 As long as it is consistent with the alphabet, although it is redundant, 998 you can still supply the gap character as an argument to this method: 999 1000 >>> my_seq 1001 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1002 >>> my_seq.ungap("=") 1003 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA()) 1004 1005 However, if the gap character given as the argument disagrees with that 1006 declared in the alphabet, an exception is raised: 1007 1008 >>> my_seq 1009 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '=')) 1010 >>> my_seq.ungap("-") 1011 Traceback (most recent call last): 1012 ... 1013 ValueError: Gap '-' does not match '=' from alphabet 1014 1015 Finally, if a gap character is not supplied, and the alphabet does not 1016 define one, an exception is raised: 1017 1018 >>> from Bio.Seq import Seq 1019 >>> from Bio.Alphabet import generic_dna 1020 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna) 1021 >>> my_dna 1022 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet()) 1023 >>> my_dna.ungap() 1024 Traceback (most recent call last): 1025 ... 1026 ValueError: Gap character not given and not defined in alphabet 1027 1028 """ 1029 if hasattr(self.alphabet, "gap_char"): 1030 if not gap: 1031 gap = self.alphabet.gap_char 1032 elif gap != self.alphabet.gap_char: 1033 raise ValueError("Gap %s does not match %s from alphabet" 1034 % (repr(gap), repr(self.alphabet.gap_char))) 1035 alpha = Alphabet._ungap(self.alphabet) 1036 elif not gap: 1037 raise ValueError("Gap character not given and not defined in alphabet") 1038 else: 1039 alpha = self.alphabet # modify! 1040 if len(gap)!=1 or not isinstance(gap, str): 1041 raise ValueError("Unexpected gap character, %s" % repr(gap)) 1042 return Seq(str(self).replace(gap, ""), alpha)
1043 1044
1045 -class UnknownSeq(Seq):
1046 """A read-only sequence object of known length but unknown contents. 1047 1048 If you have an unknown sequence, you can represent this with a normal 1049 Seq object, for example: 1050 1051 >>> my_seq = Seq("N"*5) 1052 >>> my_seq 1053 Seq('NNNNN', Alphabet()) 1054 >>> len(my_seq) 1055 5 1056 >>> print my_seq 1057 NNNNN 1058 1059 However, this is rather wasteful of memory (especially for large 1060 sequences), which is where this class is most usefull: 1061 1062 >>> unk_five = UnknownSeq(5) 1063 >>> unk_five 1064 UnknownSeq(5, alphabet = Alphabet(), character = '?') 1065 >>> len(unk_five) 1066 5 1067 >>> print(unk_five) 1068 ????? 1069 1070 You can add unknown sequence together, provided their alphabets and 1071 characters are compatible, and get another memory saving UnknownSeq: 1072 1073 >>> unk_four = UnknownSeq(4) 1074 >>> unk_four 1075 UnknownSeq(4, alphabet = Alphabet(), character = '?') 1076 >>> unk_four + unk_five 1077 UnknownSeq(9, alphabet = Alphabet(), character = '?') 1078 1079 If the alphabet or characters don't match up, the addition gives an 1080 ordinary Seq object: 1081 1082 >>> unk_nnnn = UnknownSeq(4, character = "N") 1083 >>> unk_nnnn 1084 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 1085 >>> unk_nnnn + unk_four 1086 Seq('NNNN????', Alphabet()) 1087 1088 Combining with a real Seq gives a new Seq object: 1089 1090 >>> known_seq = Seq("ACGT") 1091 >>> unk_four + known_seq 1092 Seq('????ACGT', Alphabet()) 1093 >>> known_seq + unk_four 1094 Seq('ACGT????', Alphabet()) 1095 """
1096 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
1097 """Create a new UnknownSeq object. 1098 1099 If character is omitted, it is determined from the alphabet, "N" for 1100 nucleotides, "X" for proteins, and "?" otherwise. 1101 """ 1102 self._length = int(length) 1103 if self._length < 0: 1104 #TODO - Block zero length UnknownSeq? You can just use a Seq! 1105 raise ValueError("Length must not be negative.") 1106 self.alphabet = alphabet 1107 if character: 1108 if len(character) != 1: 1109 raise ValueError("character argument should be a single letter string.") 1110 self._character = character 1111 else: 1112 base = Alphabet._get_base_alphabet(alphabet) 1113 #TODO? Check the case of the letters in the alphabet? 1114 #We may have to use "n" instead of "N" etc. 1115 if isinstance(base, Alphabet.NucleotideAlphabet): 1116 self._character = "N" 1117 elif isinstance(base, Alphabet.ProteinAlphabet): 1118 self._character = "X" 1119 else: 1120 self._character = "?"
1121
1122 - def __len__(self):
1123 """Returns the stated length of the unknown sequence.""" 1124 return self._length
1125
1126 - def __str__(self):
1127 """Returns the unknown sequence as full string of the given length.""" 1128 return self._character * self._length
1129
1130 - def __repr__(self):
1131 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 1132 % (self._length, repr(self.alphabet), repr(self._character))
1133
1134 - def __add__(self, other):
1135 """Add another sequence or string to this sequence. 1136 1137 Adding two UnknownSeq objects returns another UnknownSeq object 1138 provided the character is the same and the alphabets are compatible. 1139 1140 >>> from Bio.Seq import UnknownSeq 1141 >>> from Bio.Alphabet import generic_protein 1142 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein) 1143 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X') 1144 1145 If the characters differ, an UnknownSeq object cannot be used, so a 1146 Seq object is returned: 1147 1148 >>> from Bio.Seq import UnknownSeq 1149 >>> from Bio.Alphabet import generic_protein 1150 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein, 1151 ... character="x") 1152 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet()) 1153 1154 If adding a string to an UnknownSeq, a new Seq is returned with the 1155 same alphabet: 1156 1157 >>> from Bio.Seq import UnknownSeq 1158 >>> from Bio.Alphabet import generic_protein 1159 >>> UnknownSeq(5, generic_protein) + "LV" 1160 Seq('XXXXXLV', ProteinAlphabet()) 1161 """ 1162 if isinstance(other, UnknownSeq) \ 1163 and other._character == self._character: 1164 #TODO - Check the alphabets match 1165 return UnknownSeq(len(self)+len(other), 1166 self.alphabet, self._character) 1167 #Offload to the base class... 1168 return Seq(str(self), self.alphabet) + other
1169
1170 - def __radd__(self, other):
1171 #If other is an UnknownSeq, then __add__ would be called. 1172 #Offload to the base class... 1173 return other + Seq(str(self), self.alphabet)
1174
1175 - def __getitem__(self, index):
1176 """Get a subsequence from the UnknownSeq object. 1177 1178 >>> unk = UnknownSeq(8, character="N") 1179 >>> print unk[:] 1180 NNNNNNNN 1181 >>> print unk[5:3] 1182 <BLANKLINE> 1183 >>> print unk[1:-1] 1184 NNNNNN 1185 >>> print unk[1:-1:2] 1186 NNN 1187 """ 1188 if isinstance(index, int): 1189 #TODO - Check the bounds without wasting memory 1190 return str(self)[index] 1191 old_length = self._length 1192 step = index.step 1193 if step is None or step == 1: 1194 #This calculates the length you'd get from ("N"*old_length)[index] 1195 start = index.start 1196 end = index.stop 1197 if start is None: 1198 start = 0 1199 elif start < 0: 1200 start = max(0, old_length + start) 1201 elif start > old_length: 1202 start = old_length 1203 if end is None: 1204 end = old_length 1205 elif end < 0: 1206 end = max(0, old_length + end) 1207 elif end > old_length: 1208 end = old_length 1209 new_length = max(0, end-start) 1210 elif step == 0: 1211 raise ValueError("slice step cannot be zero") 1212 else: 1213 #TODO - handle step efficiently 1214 new_length = len(("X"*old_length)[index]) 1215 #assert new_length == len(("X"*old_length)[index]), \ 1216 # (index, start, end, step, old_length, 1217 # new_length, len(("X"*old_length)[index])) 1218 return UnknownSeq(new_length, self.alphabet, self._character)
1219
1220 - def count(self, sub, start=0, end=sys.maxint):
1221 """Non-overlapping count method, like that of a python string. 1222 1223 This behaves like the python string (and Seq object) method of the 1224 same name, which does a non-overlapping count! 1225 1226 Returns an integer, the number of occurrences of substring 1227 argument sub in the (sub)sequence given by [start:end]. 1228 Optional arguments start and end are interpreted as in slice 1229 notation. 1230 1231 Arguments: 1232 - sub - a string or another Seq object to look for 1233 - start - optional integer, slice start 1234 - end - optional integer, slice end 1235 1236 >>> "NNNN".count("N") 1237 4 1238 >>> Seq("NNNN").count("N") 1239 4 1240 >>> UnknownSeq(4, character="N").count("N") 1241 4 1242 >>> UnknownSeq(4, character="N").count("A") 1243 0 1244 >>> UnknownSeq(4, character="N").count("AA") 1245 0 1246 1247 HOWEVER, please note because that python strings and Seq objects (and 1248 MutableSeq objects) do a non-overlapping search, this may not give 1249 the answer you expect: 1250 1251 >>> UnknownSeq(4, character="N").count("NN") 1252 2 1253 >>> UnknownSeq(4, character="N").count("NNN") 1254 1 1255 """ 1256 sub_str = self._get_seq_str_and_check_alphabet(sub) 1257 if len(sub_str) == 1: 1258 if str(sub_str) == self._character: 1259 if start==0 and end >= self._length: 1260 return self._length 1261 else: 1262 #This could be done more cleverly... 1263 return str(self).count(sub_str, start, end) 1264 else: 1265 return 0 1266 else: 1267 if set(sub_str) == set(self._character): 1268 if start==0 and end >= self._length: 1269 return self._length // len(sub_str) 1270 else: 1271 #This could be done more cleverly... 1272 return str(self).count(sub_str, start, end) 1273 else: 1274 return 0
1275
1276 - def complement(self):
1277 """The complement of an unknown nucleotide equals itself. 1278 1279 >>> my_nuc = UnknownSeq(8) 1280 >>> my_nuc 1281 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1282 >>> print my_nuc 1283 ???????? 1284 >>> my_nuc.complement() 1285 UnknownSeq(8, alphabet = Alphabet(), character = '?') 1286 >>> print my_nuc.complement() 1287 ???????? 1288 """ 1289 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1290 Alphabet.ProteinAlphabet): 1291 raise ValueError("Proteins do not have complements!") 1292 return self
1293
1294 - def reverse_complement(self):
1295 """The reverse complement of an unknown nucleotide equals itself. 1296 1297 >>> my_nuc = UnknownSeq(10) 1298 >>> my_nuc 1299 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1300 >>> print my_nuc 1301 ?????????? 1302 >>> my_nuc.reverse_complement() 1303 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1304 >>> print my_nuc.reverse_complement() 1305 ?????????? 1306 """ 1307 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1308 Alphabet.ProteinAlphabet): 1309 raise ValueError("Proteins do not have complements!") 1310 return self
1311
1312 - def transcribe(self):
1313 """Returns unknown RNA sequence from an unknown DNA sequence. 1314 1315 >>> my_dna = UnknownSeq(10, character="N") 1316 >>> my_dna 1317 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1318 >>> print my_dna 1319 NNNNNNNNNN 1320 >>> my_rna = my_dna.transcribe() 1321 >>> my_rna 1322 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1323 >>> print my_rna 1324 NNNNNNNNNN 1325 """ 1326 #Offload the alphabet stuff 1327 s = Seq(self._character, self.alphabet).transcribe() 1328 return UnknownSeq(self._length, s.alphabet, self._character)
1329
1330 - def back_transcribe(self):
1331 """Returns unknown DNA sequence from an unknown RNA sequence. 1332 1333 >>> my_rna = UnknownSeq(20, character="N") 1334 >>> my_rna 1335 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1336 >>> print my_rna 1337 NNNNNNNNNNNNNNNNNNNN 1338 >>> my_dna = my_rna.back_transcribe() 1339 >>> my_dna 1340 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1341 >>> print my_dna 1342 NNNNNNNNNNNNNNNNNNNN 1343 """ 1344 #Offload the alphabet stuff 1345 s = Seq(self._character, self.alphabet).back_transcribe() 1346 return UnknownSeq(self._length, s.alphabet, self._character)
1347
1348 - def upper(self):
1349 """Returns an upper case copy of the sequence. 1350 1351 >>> from Bio.Alphabet import generic_dna 1352 >>> from Bio.Seq import UnknownSeq 1353 >>> my_seq = UnknownSeq(20, generic_dna, character="n") 1354 >>> my_seq 1355 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n') 1356 >>> print my_seq 1357 nnnnnnnnnnnnnnnnnnnn 1358 >>> my_seq.upper() 1359 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1360 >>> print my_seq.upper() 1361 NNNNNNNNNNNNNNNNNNNN 1362 1363 This will adjust the alphabet if required. See also the lower method. 1364 """ 1365 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1366
1367 - def lower(self):
1368 """Returns a lower case copy of the sequence. 1369 1370 This will adjust the alphabet if required: 1371 1372 >>> from Bio.Alphabet import IUPAC 1373 >>> from Bio.Seq import UnknownSeq 1374 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein) 1375 >>> my_seq 1376 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X') 1377 >>> print my_seq 1378 XXXXXXXXXXXXXXXXXXXX 1379 >>> my_seq.lower() 1380 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x') 1381 >>> print my_seq.lower() 1382 xxxxxxxxxxxxxxxxxxxx 1383 1384 See also the upper method. 1385 """ 1386 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1387
1388 - def translate(self, **kwargs):
1389 """Translate an unknown nucleotide sequence into an unknown protein. 1390 1391 e.g. 1392 1393 >>> my_seq = UnknownSeq(11, character="N") 1394 >>> print my_seq 1395 NNNNNNNNNNN 1396 >>> my_protein = my_seq.translate() 1397 >>> my_protein 1398 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1399 >>> print my_protein 1400 XXX 1401 1402 In comparison, using a normal Seq object: 1403 1404 >>> my_seq = Seq("NNNNNNNNNNN") 1405 >>> print my_seq 1406 NNNNNNNNNNN 1407 >>> my_protein = my_seq.translate() 1408 >>> my_protein 1409 Seq('XXX', ExtendedIUPACProtein()) 1410 >>> print my_protein 1411 XXX 1412 1413 """ 1414 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1415 Alphabet.ProteinAlphabet): 1416 raise ValueError("Proteins cannot be translated!") 1417 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1418
1419 - def ungap(self, gap=None):
1420 """Return a copy of the sequence without the gap character(s). 1421 1422 The gap character can be specified in two ways - either as an explicit 1423 argument, or via the sequence's alphabet. For example: 1424 1425 >>> from Bio.Seq import UnknownSeq 1426 >>> from Bio.Alphabet import Gapped, generic_dna 1427 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-")) 1428 >>> my_dna 1429 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N') 1430 >>> my_dna.ungap() 1431 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1432 >>> my_dna.ungap("-") 1433 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1434 1435 If the UnknownSeq is using the gap character, then an empty Seq is 1436 returned: 1437 1438 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-") 1439 >>> my_gap 1440 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-') 1441 >>> my_gap.ungap() 1442 Seq('', DNAAlphabet()) 1443 >>> my_gap.ungap("-") 1444 Seq('', DNAAlphabet()) 1445 1446 Notice that the returned sequence's alphabet is adjusted to remove any 1447 explicit gap character declaration. 1448 """ 1449 #Offload the alphabet stuff 1450 s = Seq(self._character, self.alphabet).ungap() 1451 if s: 1452 return UnknownSeq(self._length, s.alphabet, self._character) 1453 else: 1454 return Seq("", s.alphabet)
1455 1456
1457 -class MutableSeq(object):
1458 """An editable sequence object (with an alphabet). 1459 1460 Unlike normal python strings and our basic sequence object (the Seq class) 1461 which are immuatable, the MutableSeq lets you edit the sequence in place. 1462 However, this means you cannot use a MutableSeq object as a dictionary key. 1463 1464 >>> from Bio.Seq import MutableSeq 1465 >>> from Bio.Alphabet import generic_dna 1466 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1467 >>> my_seq 1468 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1469 >>> my_seq[5] 1470 'T' 1471 >>> my_seq[5] = "A" 1472 >>> my_seq 1473 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1474 >>> my_seq[5] 1475 'A' 1476 >>> my_seq[5:8] = "NNN" 1477 >>> my_seq 1478 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1479 >>> len(my_seq) 1480 11 1481 1482 Note that the MutableSeq object does not support as many string-like 1483 or biological methods as the Seq object. 1484 """
1485 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1486 if sys.version_info[0] == 3: 1487 self.array_indicator = "u" 1488 else: 1489 self.array_indicator = "c" 1490 if isinstance(data, str): # TODO - What about unicode? 1491 self.data = array.array(self.array_indicator, data) 1492 else: 1493 self.data = data # assumes the input is an array 1494 self.alphabet = alphabet
1495
1496 - def __repr__(self):
1497 """Returns a (truncated) representation of the sequence for debugging.""" 1498 if len(self) > 60: 1499 #Shows the last three letters as it is often useful to see if there 1500 #is a stop codon at the end of a sequence. 1501 #Note total length is 54+3+3=60 1502 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1503 str(self[:54]), str(self[-3:]), 1504 repr(self.alphabet)) 1505 else: 1506 return "%s('%s', %s)" % (self.__class__.__name__, 1507 str(self), 1508 repr(self.alphabet))
1509
1510 - def __str__(self):
1511 """Returns the full sequence as a python string. 1512 1513 Note that Biopython 1.44 and earlier would give a truncated 1514 version of repr(my_seq) for str(my_seq). If you are writing code 1515 which needs to be backwards compatible with old Biopython, you 1516 should continue to use my_seq.tostring() rather than str(my_seq). 1517 """ 1518 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1519 return "".join(self.data)
1520
1521 - def __cmp__(self, other):
1522 """Compare the sequence to another sequence or a string (README). 1523 1524 Currently if compared to another sequence the alphabets must be 1525 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise 1526 an exception. Otherwise only the sequence itself is compared, not the 1527 precise alphabet. 1528 1529 A future release of Biopython will change this (and the Seq object etc) 1530 to use simple string comparison. The plan is that comparing sequences 1531 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning 1532 but not an exception. 1533 1534 During this transition period, please just do explicit comparisons: 1535 1536 >>> seq1 = MutableSeq("ACGT") 1537 >>> seq2 = MutableSeq("ACGT") 1538 >>> id(seq1) == id(seq2) 1539 False 1540 >>> str(seq1) == str(seq2) 1541 True 1542 1543 This method indirectly supports ==, < , etc. 1544 """ 1545 if hasattr(other, "alphabet"): 1546 #other should be a Seq or a MutableSeq 1547 import warnings 1548 warnings.warn("In future comparing incompatible alphabets will " 1549 "only trigger a warning (not an exception). In " 1550 "the interim please use id(seq1)==id(seq2) or " 1551 "str(seq1)==str(seq2) to make your code explicit " 1552 "and to avoid this warning.", FutureWarning) 1553 if not Alphabet._check_type_compatible([self.alphabet, 1554 other.alphabet]): 1555 raise TypeError("Incompatable alphabets %s and %s" 1556 % (repr(self.alphabet), repr(other.alphabet))) 1557 #They should be the same sequence type (or one of them is generic) 1558 if isinstance(other, MutableSeq): 1559 #See test_GAQueens.py for an historic usage of a non-string 1560 #alphabet! Comparing the arrays supports this. 1561 return cmp(self.data, other.data) 1562 else: 1563 return cmp(str(self), str(other)) 1564 elif isinstance(other, basestring): 1565 return cmp(str(self), other) 1566 else: 1567 raise TypeError
1568
1569 - def __len__(self):
1570 return len(self.data)
1571
1572 - def __getitem__(self, index):
1573 #Note since Python 2.0, __getslice__ is deprecated 1574 #and __getitem__ is used instead. 1575 #See http://docs.python.org/ref/sequence-methods.html 1576 if isinstance(index, int): 1577 #Return a single letter as a string 1578 return self.data[index] 1579 else: 1580 #Return the (sub)sequence as another Seq object 1581 return MutableSeq(self.data[index], self.alphabet)
1582
1583 - def __setitem__(self, index, value):
1584 #Note since Python 2.0, __setslice__ is deprecated 1585 #and __setitem__ is used instead. 1586 #See http://docs.python.org/ref/sequence-methods.html 1587 if isinstance(index, int): 1588 #Replacing a single letter with a new string 1589 self.data[index] = value 1590 else: 1591 #Replacing a sub-sequence 1592 if isinstance(value, MutableSeq): 1593 self.data[index] = value.data 1594 elif isinstance(value, type(self.data)): 1595 self.data[index] = value 1596 else: 1597 self.data[index] = array.array(self.array_indicator, 1598 str(value))
1599
1600 - def __delitem__(self, index):
1601 #Note since Python 2.0, __delslice__ is deprecated 1602 #and __delitem__ is used instead. 1603 #See http://docs.python.org/ref/sequence-methods.html 1604 1605 #Could be deleting a single letter, or a slice 1606 del self.data[index]
1607
1608 - def __add__(self, other):
1609 """Add another sequence or string to this sequence. 1610 1611 Returns a new MutableSeq object.""" 1612 if hasattr(other, "alphabet"): 1613 #other should be a Seq or a MutableSeq 1614 if not Alphabet._check_type_compatible([self.alphabet, 1615 other.alphabet]): 1616 raise TypeError("Incompatable alphabets %s and %s" 1617 % (repr(self.alphabet), repr(other.alphabet))) 1618 #They should be the same sequence type (or one of them is generic) 1619 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1620 if isinstance(other, MutableSeq): 1621 #See test_GAQueens.py for an historic usage of a non-string 1622 #alphabet! Adding the arrays should support this. 1623 return self.__class__(self.data + other.data, a) 1624 else: 1625 return self.__class__(str(self) + str(other), a) 1626 elif isinstance(other, basestring): 1627 #other is a plain string - use the current alphabet 1628 return self.__class__(str(self) + str(other), self.alphabet) 1629 else: 1630 raise TypeError
1631
1632 - def __radd__(self, other):
1633 if hasattr(other, "alphabet"): 1634 #other should be a Seq or a MutableSeq 1635 if not Alphabet._check_type_compatible([self.alphabet, 1636 other.alphabet]): 1637 raise TypeError("Incompatable alphabets %s and %s" 1638 % (repr(self.alphabet), repr(other.alphabet))) 1639 #They should be the same sequence type (or one of them is generic) 1640 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1641 if isinstance(other, MutableSeq): 1642 #See test_GAQueens.py for an historic usage of a non-string 1643 #alphabet! Adding the arrays should support this. 1644 return self.__class__(other.data + self.data, a) 1645 else: 1646 return self.__class__(str(other) + str(self), a) 1647 elif isinstance(other, basestring): 1648 #other is a plain string - use the current alphabet 1649 return self.__class__(str(other) + str(self), self.alphabet) 1650 else: 1651 raise TypeError
1652
1653 - def append(self, c):
1654 self.data.append(c)
1655
1656 - def insert(self, i, c):
1657 self.data.insert(i, c)
1658
1659 - def pop(self, i = (-1)):
1660 c = self.data[i] 1661 del self.data[i] 1662 return c
1663
1664 - def remove(self, item):
1665 for i in range(len(self.data)): 1666 if self.data[i] == item: 1667 del self.data[i] 1668 return 1669 raise ValueError("MutableSeq.remove(x): x not in list")
1670
1671 - def count(self, sub, start=0, end=sys.maxint):
1672 """Non-overlapping count method, like that of a python string. 1673 1674 This behaves like the python string method of the same name, 1675 which does a non-overlapping count! 1676 1677 Returns an integer, the number of occurrences of substring 1678 argument sub in the (sub)sequence given by [start:end]. 1679 Optional arguments start and end are interpreted as in slice 1680 notation. 1681 1682 Arguments: 1683 - sub - a string or another Seq object to look for 1684 - start - optional integer, slice start 1685 - end - optional integer, slice end 1686 1687 e.g. 1688 1689 >>> from Bio.Seq import MutableSeq 1690 >>> my_mseq = MutableSeq("AAAATGA") 1691 >>> print my_mseq.count("A") 1692 5 1693 >>> print my_mseq.count("ATG") 1694 1 1695 >>> print my_mseq.count(Seq("AT")) 1696 1 1697 >>> print my_mseq.count("AT", 2, -1) 1698 1 1699 1700 HOWEVER, please note because that python strings, Seq objects and 1701 MutableSeq objects do a non-overlapping search, this may not give 1702 the answer you expect: 1703 1704 >>> "AAAA".count("AA") 1705 2 1706 >>> print MutableSeq("AAAA").count("AA") 1707 2 1708 1709 A non-overlapping search would give the answer as three! 1710 """ 1711 try: 1712 #TODO - Should we check the alphabet? 1713 search = sub.tostring() 1714 except AttributeError: 1715 search = sub 1716 1717 if not isinstance(search, basestring): 1718 raise TypeError("expected a string, Seq or MutableSeq") 1719 1720 if len(search) == 1: 1721 #Try and be efficient and work directly from the array. 1722 count = 0 1723 for c in self.data[start:end]: 1724 if c == search: 1725 count += 1 1726 return count 1727 else: 1728 #TODO - Can we do this more efficiently? 1729 return self.tostring().count(search, start, end)
1730
1731 - def index(self, item):
1732 for i in range(len(self.data)): 1733 if self.data[i] == item: 1734 return i 1735 raise ValueError("MutableSeq.index(x): x not in list")
1736
1737 - def reverse(self):
1738 """Modify the mutable sequence to reverse itself. 1739 1740 No return value. 1741 """ 1742 self.data.reverse()
1743
1744 - def complement(self):
1745 """Modify the mutable sequence to take on its complement. 1746 1747 Trying to complement a protein sequence raises an exception. 1748 1749 No return value. 1750 """ 1751 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1752 Alphabet.ProteinAlphabet): 1753 raise ValueError("Proteins do not have complements!") 1754 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1755 d = ambiguous_dna_complement 1756 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1757 d = ambiguous_rna_complement 1758 elif 'U' in self.data and 'T' in self.data: 1759 #TODO - Handle this cleanly? 1760 raise ValueError("Mixed RNA/DNA found") 1761 elif 'U' in self.data: 1762 d = ambiguous_rna_complement 1763 else: 1764 d = ambiguous_dna_complement 1765 c = dict([(x.lower(), y.lower()) for x, y in d.iteritems()]) 1766 d.update(c) 1767 self.data = map(lambda c: d[c], self.data) 1768 self.data = array.array(self.array_indicator, self.data)
1769
1770 - def reverse_complement(self):
1771 """Modify the mutable sequence to take on its reverse complement. 1772 1773 Trying to reverse complement a protein sequence raises an exception. 1774 1775 No return value. 1776 """ 1777 self.complement() 1778 self.data.reverse()
1779 1780 ## Sorting a sequence makes no sense. 1781 # def sort(self, *args): self.data.sort(*args) 1782
1783 - def extend(self, other):
1784 if isinstance(other, MutableSeq): 1785 for c in other.data: 1786 self.data.append(c) 1787 else: 1788 for c in other: 1789 self.data.append(c)
1790
1791 - def tostring(self):
1792 """Returns the full sequence as a python string (semi-obsolete). 1793 1794 Although not formally deprecated, you are now encouraged to use 1795 str(my_seq) instead of my_seq.tostring(). 1796 1797 Because str(my_seq) will give you the full sequence as a python string, 1798 there is often no need to make an explicit conversion. For example, 1799 1800 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 1801 1802 On Biopython 1.44 or older you would have to have done this: 1803 1804 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 1805 """ 1806 return "".join(self.data)
1807
1808 - def toseq(self):
1809 """Returns the full sequence as a new immutable Seq object. 1810 1811 >>> from Bio.Seq import Seq 1812 >>> from Bio.Alphabet import IUPAC 1813 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 1814 ... IUPAC.protein) 1815 >>> my_mseq 1816 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1817 >>> my_mseq.toseq() 1818 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1819 1820 Note that the alphabet is preserved. 1821 """ 1822 return Seq("".join(self.data), self.alphabet)
1823 1824 1825 # The transcribe, backward_transcribe, and translate functions are 1826 # user-friendly versions of the corresponding functions in Bio.Transcribe 1827 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1828
1829 -def transcribe(dna):
1830 """Transcribes a DNA sequence into RNA. 1831 1832 If given a string, returns a new string object. 1833 1834 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1835 1836 Trying to transcribe a protein or RNA sequence raises an exception. 1837 1838 e.g. 1839 1840 >>> transcribe("ACTGN") 1841 'ACUGN' 1842 """ 1843 if isinstance(dna, Seq): 1844 return dna.transcribe() 1845 elif isinstance(dna, MutableSeq): 1846 return dna.toseq().transcribe() 1847 else: 1848 return dna.replace('T', 'U').replace('t', 'u')
1849 1850
1851 -def back_transcribe(rna):
1852 """Back-transcribes an RNA sequence into DNA. 1853 1854 If given a string, returns a new string object. 1855 1856 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1857 1858 Trying to transcribe a protein or DNA sequence raises an exception. 1859 1860 e.g. 1861 1862 >>> back_transcribe("ACUGN") 1863 'ACTGN' 1864 """ 1865 if isinstance(rna, Seq): 1866 return rna.back_transcribe() 1867 elif isinstance(rna, MutableSeq): 1868 return rna.toseq().back_transcribe() 1869 else: 1870 return rna.replace('U', 'T').replace('u', 't')
1871 1872
1873 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, 1874 cds=False, pos_stop="X"):
1875 """Helper function to translate a nucleotide string (PRIVATE). 1876 1877 Arguments: 1878 - sequence - a string 1879 - table - a CodonTable object (NOT a table name or id number) 1880 - stop_symbol - a single character string, what to use for terminators. 1881 - to_stop - boolean, should translation terminate at the first 1882 in frame stop codon? If there is no in-frame stop codon 1883 then translation continues to the end. 1884 - pos_stop - a single character string for a possible stop codon 1885 (e.g. TAN or NNN) 1886 - cds - Boolean, indicates this is a complete CDS. If True, this 1887 checks the sequence starts with a valid alternative start 1888 codon (which will be translated as methionine, M), that the 1889 sequence length is a multiple of three, and that there is a 1890 single in frame stop codon at the end (this will be excluded 1891 from the protein sequence, regardless of the to_stop option). 1892 If these tests fail, an exception is raised. 1893 1894 Returns a string. 1895 1896 e.g. 1897 1898 >>> from Bio.Data import CodonTable 1899 >>> table = CodonTable.ambiguous_dna_by_id[1] 1900 >>> _translate_str("AAA", table) 1901 'K' 1902 >>> _translate_str("TAR", table) 1903 '*' 1904 >>> _translate_str("TAN", table) 1905 'X' 1906 >>> _translate_str("TAN", table, pos_stop="@") 1907 '@' 1908 >>> _translate_str("TA?", table) 1909 Traceback (most recent call last): 1910 ... 1911 TranslationError: Codon 'TA?' is invalid 1912 >>> _translate_str("ATGCCCTAG", table, cds=True) 1913 'MP' 1914 >>> _translate_str("AAACCCTAG", table, cds=True) 1915 Traceback (most recent call last): 1916 ... 1917 TranslationError: First codon 'AAA' is not a start codon 1918 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True) 1919 Traceback (most recent call last): 1920 ... 1921 TranslationError: Extra in frame stop codon found. 1922 """ 1923 sequence = sequence.upper() 1924 amino_acids = [] 1925 forward_table = table.forward_table 1926 stop_codons = table.stop_codons 1927 if table.nucleotide_alphabet.letters is not None: 1928 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1929 else: 1930 #Assume the worst case, ambiguous DNA or RNA: 1931 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + 1932 IUPAC.ambiguous_rna.letters.upper()) 1933 if cds: 1934 if str(sequence[:3]).upper() not in table.start_codons: 1935 raise CodonTable.TranslationError( 1936 "First codon '%s' is not a start codon" % sequence[:3]) 1937 if len(sequence) % 3 != 0: 1938 raise CodonTable.TranslationError( 1939 "Sequence length %i is not a multiple of three" % len(sequence)) 1940 if str(sequence[-3:]).upper() not in stop_codons: 1941 raise CodonTable.TranslationError( 1942 "Final codon '%s' is not a stop codon" % sequence[-3:]) 1943 #Don't translate the stop symbol, and manually translate the M 1944 sequence = sequence[3:-3] 1945 amino_acids = ["M"] 1946 n = len(sequence) 1947 for i in xrange(0, n-n%3, 3): 1948 codon = sequence[i:i+3] 1949 try: 1950 amino_acids.append(forward_table[codon]) 1951 except (KeyError, CodonTable.TranslationError): 1952 #Todo? Treat "---" as a special case (gapped translation) 1953 if codon in table.stop_codons: 1954 if cds: 1955 raise CodonTable.TranslationError( 1956 "Extra in frame stop codon found.") 1957 if to_stop: 1958 break 1959 amino_acids.append(stop_symbol) 1960 elif valid_letters.issuperset(set(codon)): 1961 #Possible stop codon (e.g. NNN or TAN) 1962 amino_acids.append(pos_stop) 1963 else: 1964 raise CodonTable.TranslationError( 1965 "Codon '%s' is invalid" % codon) 1966 return "".join(amino_acids)
1967 1968
1969 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False, 1970 cds=False):
1971 """Translate a nucleotide sequence into amino acids. 1972 1973 If given a string, returns a new string object. Given a Seq or 1974 MutableSeq, returns a Seq object with a protein alphabet. 1975 1976 Arguments: 1977 - table - Which codon table to use? This can be either a name (string), 1978 an NCBI identifier (integer), or a CodonTable object (useful 1979 for non-standard genetic codes). Defaults to the "Standard" 1980 table. 1981 - stop_symbol - Single character string, what to use for any 1982 terminators, defaults to the asterisk, "*". 1983 - to_stop - Boolean, defaults to False meaning do a full 1984 translation continuing on past any stop codons 1985 (translated as the specified stop_symbol). If 1986 True, translation is terminated at the first in 1987 frame stop codon (and the stop_symbol is not 1988 appended to the returned protein sequence). 1989 - cds - Boolean, indicates this is a complete CDS. If True, this 1990 checks the sequence starts with a valid alternative start 1991 codon (which will be translated as methionine, M), that the 1992 sequence length is a multiple of three, and that there is a 1993 single in frame stop codon at the end (this will be excluded 1994 from the protein sequence, regardless of the to_stop option). 1995 If these tests fail, an exception is raised. 1996 1997 A simple string example using the default (standard) genetic code: 1998 1999 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 2000 >>> translate(coding_dna) 2001 'VAIVMGR*KGAR*' 2002 >>> translate(coding_dna, stop_symbol="@") 2003 'VAIVMGR@KGAR@' 2004 >>> translate(coding_dna, to_stop=True) 2005 'VAIVMGR' 2006 2007 Now using NCBI table 2, where TGA is not a stop codon: 2008 2009 >>> translate(coding_dna, table=2) 2010 'VAIVMGRWKGAR*' 2011 >>> translate(coding_dna, table=2, to_stop=True) 2012 'VAIVMGRWKGAR' 2013 2014 In fact this example uses an alternative start codon valid under NCBI table 2, 2015 GTG, which means this example is a complete valid CDS which when translated 2016 should really start with methionine (not valine): 2017 2018 >>> translate(coding_dna, table=2, cds=True) 2019 'MAIVMGRWKGAR' 2020 2021 Note that if the sequence has no in-frame stop codon, then the to_stop 2022 argument has no effect: 2023 2024 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 2025 >>> translate(coding_dna2) 2026 'VAIVMGR' 2027 >>> translate(coding_dna2, to_stop=True) 2028 'VAIVMGR' 2029 2030 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 2031 or a stop codon. These are translated as "X". Any invalid codon 2032 (e.g. "TA?" or "T-A") will throw a TranslationError. 2033 2034 NOTE - Does NOT support gapped sequences. 2035 2036 It will however translate either DNA or RNA. 2037 """ 2038 if isinstance(sequence, Seq): 2039 return sequence.translate(table, stop_symbol, to_stop, cds) 2040 elif isinstance(sequence, MutableSeq): 2041 #Return a Seq object 2042 return sequence.toseq().translate(table, stop_symbol, to_stop, cds) 2043 else: 2044 #Assume its a string, return a string 2045 try: 2046 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 2047 except ValueError: 2048 codon_table = CodonTable.ambiguous_generic_by_name[table] 2049 except (AttributeError, TypeError): 2050 if isinstance(table, CodonTable.CodonTable): 2051 codon_table = table 2052 else: 2053 raise ValueError('Bad table argument') 2054 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2055 2056
2057 -def reverse_complement(sequence):
2058 """Returns the reverse complement sequence of a nucleotide string. 2059 2060 If given a string, returns a new string object. 2061 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 2062 2063 Supports unambiguous and ambiguous nucleotide sequences. 2064 2065 e.g. 2066 2067 >>> reverse_complement("ACTG-NH") 2068 'DN-CAGT' 2069 """ 2070 if isinstance(sequence, Seq): 2071 #Return a Seq 2072 return sequence.reverse_complement() 2073 elif isinstance(sequence, MutableSeq): 2074 #Return a Seq 2075 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 2076 return sequence.toseq().reverse_complement() 2077 2078 #Assume its a string. 2079 #In order to avoid some code duplication, the old code would turn the string 2080 #into a Seq, use the reverse_complement method, and convert back to a string. 2081 #This worked, but is over five times slower on short sequences! 2082 if ('U' in sequence or 'u' in sequence) \ 2083 and ('T' in sequence or 't' in sequence): 2084 raise ValueError("Mixed RNA/DNA found") 2085 elif 'U' in sequence or 'u' in sequence: 2086 ttable = _rna_complement_table 2087 else: 2088 ttable = _dna_complement_table 2089 return sequence.translate(ttable)[::-1]
2090 2091
2092 -def _test():
2093 """Run the Bio.Seq module's doctests (PRIVATE).""" 2094 if sys.version_info[0:2] == (3, 1): 2095 print "Not running Bio.Seq doctest on Python 3.1" 2096 print "See http://bugs.python.org/issue7490" 2097 else: 2098 print "Running doctests..." 2099 import doctest 2100 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 2101 print "Done"
2102 2103 if __name__ == "__main__": 2104 _test() 2105