Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

  1  # Copyright 2000-2002 by Andrew Dalke. 
  2  # Revisions copyright 2007-2010 by Peter Cock. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Alphabets used in Seq objects etc to declare sequence type and letters. 
  9   
 10  This is used by sequences which contain a finite number of similar words. 
 11  """ 
 12   
 13   
class Alphabet(object):
    """Root of the alphabet class hierarchy.

    All other alphabet types derive from this class.

    Attributes:
     - letters - list-like collection of the letters in the alphabet;
       normally a plain string when every letter is one character.
     - size - length of each letter of the alphabet (e.g. 1 when the
       letters are single characters).

    """

    # No fixed alphabet by default.  In general this is a list-like object,
    # but a string is used when the letters are single characters, which is
    # what Seq like objects expect.
    letters = None
    # No fixed word size by default.
    size = None

    def __repr__(self):
        """Return the class name followed by an empty argument list."""
        return "%s()" % self.__class__.__name__

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  Only the Alphabet subclass hierarchy is
        consulted; the letters property is never inspected.  This isn't
        ideal, and doesn't seem to work as intended with the
        AlphabetEncoder classes.
        """
        own_class = self.__class__
        return isinstance(other, own_class)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE)."""
        # TODO - remove this method by dealing with things in subclasses?
        # Ordered from most to least specific; the first matching class wins.
        fallbacks = (
            (ProteinAlphabet, generic_protein),
            (DNAAlphabet, generic_dna),
            (RNAAlphabet, generic_rna),
            (NucleotideAlphabet, generic_nucleotide),
            (SingleLetterAlphabet, single_letter_alphabet),
        )
        for klass, generic in fallbacks:
            if isinstance(self, klass):
                return generic
        return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if letters and letters != letters.upper():
            # TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()
        # Easy case, no letters or already upper case!
        return self

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if letters and letters != letters.lower():
            # TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()
        # Easy case, no letters or already lower case!
        return self


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Generic alphabet in which every letter is exactly one character."""

    # String of all letters in the alphabet (none fixed at this level).
    letters = None
    size = 1


single_letter_alphabet = SingleLetterAlphabet()

# ########## Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Generic protein alphabet using single character letters."""


generic_protein = ProteinAlphabet()

# ########## DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Generic nucleotide alphabet using single character letters."""


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Generic DNA alphabet using single character letters."""


generic_dna = DNAAlphabet()


# ########## RNA


class RNAAlphabet(NucleotideAlphabet):
    """Generic RNA alphabet using single character letters."""


generic_rna = RNAAlphabet()

# ########## Other per-sequence encodings


class SecondaryStructure(SingleLetterAlphabet):
    """Alphabet for annotating secondary structure.

    The four letters are 'H' (helix), 'S' (strand), 'T' (turn) and
    'C' (coil).
    """

    letters = "HSTC"


class ThreeLetterProtein(Alphabet):
    """Protein alphabet whose letters are three character codes."""

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
        "Gly", "His", "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
    ]

    def _upper(self):
        """Always raise, as no upper case variant is defined (PRIVATE)."""
        raise NotImplementedError(
            "We don't have an uppercase three letter protein alphabet."
        )

    def _lower(self):
        """Always raise, as no lower case variant is defined (PRIVATE)."""
        raise NotImplementedError(
            "We don't have a lowercase three letter protein alphabet."
        )

# ##### Non per-sequence modifications

# (These are Decorator classes)


class AlphabetEncoder(object):
    """Decorator which wraps an alphabet and adds extra letters to it.

    Attributes:
     - alphabet - the wrapped alphabet object.
     - new_letters - the letters added on top of the wrapped alphabet.
     - letters - the wrapped alphabet's letters followed by new_letters,
       or None when the wrapped alphabet defines no letters.

    Any other non-special attribute is looked up on the wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap the given alphabet, extending it with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate lookups of missing attributes to the wrapped alphabet.

        Special (double underscore) names are never delegated.

        Raises AttributeError for special names, and for ``alphabet``
        itself when it has not been set on this instance.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # This method only runs when normal lookup failed, so the
            # wrapped alphabet has not been set.  Reading self.alphabet
            # below would re-enter this method without end.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return a constructor-like representation of the encoder."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False.
        """
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())


class Gapped(AlphabetEncoder):
    """Alphabet encoder which adds a gap character to an alphabet."""

    def __init__(self, alphabet, gap_char="-"):
        """Wrap the given alphabet, adding gap_char as a new letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  The gap characters are compared first, then
        the wrapped alphabets are compared via the Alphabet subclassing
        hierarchy.  This fails if the other alphabet does not have a gap
        character!
        """
        if other.gap_char != self.gap_char:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        upper_gap = self.gap_char.upper()
        return Gapped(upper_base, upper_gap)

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        lower_gap = self.gap_char.lower()
        return Gapped(lower_base, lower_gap)


class HasStopCodon(AlphabetEncoder):
    """Alphabet encoder which adds a stop symbol to an alphabet."""

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap the given alphabet, adding stop_symbol as a new letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols are compared first, then the
        wrapped alphabets are compared via the Alphabet subclassing
        hierarchy.  This fails if the other alphabet does not have a stop
        symbol!
        """
        if other.stop_symbol != self.stop_symbol:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        upper_stop = self.stop_symbol.upper()
        return HasStopCodon(upper_base, upper_stop)

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        lower_stop = self.stop_symbol.lower()
        return HasStopCodon(lower_base, lower_stop)


def _get_base_alphabet(alphabet):
    """Return the Alphabet left after removing all encoders (PRIVATE).

    Every AlphabetEncoder layer (e.g. Gapped or HasStopCodon) is peeled
    off in turn.  An assertion checks that what remains is an Alphabet
    instance.
    """
    current = alphabet
    while True:
        if not isinstance(current, AlphabetEncoder):
            break
        current = current.alphabet
    assert isinstance(current, Alphabet), \
        "Invalid alphabet found, %s" % repr(current)
    return current


def _ungap(alphabet):
    """Return the alphabet without any gap encoder (PRIVATE).

    An alphabet exposing no ``gap_char`` attribute is returned unchanged.
    Otherwise the Gapped layer is dropped, and any other encoder layers
    wrapped around it are rebuilt on top of the ungapped alphabet.

    Raises NotImplementedError if the object exposes a ``gap_char`` but is
    not an AlphabetEncoder.
    """
    # TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet), stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # AlphabetEncoder takes (alphabet, new_letters), so pass only the
        # letters this layer added; the combined letters string is rebuilt
        # by the constructor from the ungapped alphabet.
        return AlphabetEncoder(_ungap(alphabet.alphabet), alphabet.new_letters)
    else:
        raise NotImplementedError


def _consensus_base_alphabet(alphabets):
    """Return a shared, often generic, base alphabet object (PRIVATE).

    Any AlphabetEncoder information, e.g. from Gapped alphabets, is
    discarded.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!
    """
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        if consensus is None:
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or one no more general than the consensus.
            continue
        if isinstance(consensus, base.__class__):
            # The new alphabet is the more general of the two.
            consensus = base
        elif isinstance(base, NucleotideAlphabet) \
                and isinstance(consensus, NucleotideAlphabet):
            # e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
        elif isinstance(base, SingleLetterAlphabet) \
                and isinstance(consensus, SingleLetterAlphabet):
            # This is a pretty big mis-match!
            consensus = single_letter_alphabet
        else:
            # We have a major mis-match... take the easy way out!
            return generic_alphabet
    # A consensus of None means we were given NO alphabets!
    return generic_alphabet if consensus is None else consensus


def _consensus_alphabet(alphabets):
    """Return a shared, often generic, alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    Gapped, HasStopCodon and letters added by other AlphabetEncoders are
    taken into account.  An exception WILL be raised if more than one gap
    character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present
    """
    gap = None
    stop = None
    new_letters = ""
    for alpha in alphabets:
        # Gaps...
        if hasattr(alpha, "gap_char"):
            if gap is None:
                gap = alpha.gap_char
            elif gap != alpha.gap_char:
                raise ValueError("More than one gap character present")
        # Stops...
        if hasattr(alpha, "stop_symbol"):
            if stop is None:
                stop = alpha.stop_symbol
            elif stop != alpha.stop_symbol:
                raise ValueError("More than one stop symbol present")
        # New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                if letter in new_letters:
                    continue
                if letter == gap or letter == stop:
                    continue
                new_letters += letter

    # Rebuild the encoder layers around the consensus base alphabet.
    result = _consensus_base_alphabet(alphabets)
    if new_letters:
        result = AlphabetEncoder(result, new_letters)
    if gap:
        result = Gapped(result, gap_char=gap)
    if stop:
        result = HasStopCodon(result, stop_symbol=stop)
    return result


def _check_type_compatible(alphabets):
    """Return True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    Only the Alphabet subclassing hierarchy is consulted.  Things like
    gap characters or stop symbols are not checked.
    """
    seen_dna = seen_rna = seen_nucl = seen_protein = False
    for alpha in alphabets:
        base = _get_base_alphabet(alpha)
        if isinstance(base, DNAAlphabet):
            if seen_rna or seen_protein:
                return False
            seen_dna = seen_nucl = True
        elif isinstance(base, RNAAlphabet):
            if seen_dna or seen_protein:
                return False
            seen_rna = seen_nucl = True
        elif isinstance(base, NucleotideAlphabet):
            if seen_protein:
                return False
            seen_nucl = True
        elif isinstance(base, ProteinAlphabet):
            if seen_nucl:
                return False
            seen_protein = True
    return True


430 -def _verify_alphabet(sequence):
431 """Check all letters in sequence are in the alphabet (PRIVATE). 432 433 >>> from Bio.Seq import Seq 434 >>> from Bio.Alphabet import IUPAC 435 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 436 ... IUPAC.protein) 437 >>> _verify_alphabet(my_seq) 438 True 439 440 This example has an X, which is not in the IUPAC protein alphabet 441 (you should be using the IUPAC extended protein alphabet): 442 443 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX", 444 ... IUPAC.protein) 445 >>> _verify_alphabet(bad_seq) 446 False 447 448 This replaces Bio.utils.verify_alphabet() since we are deprecating 449 that. Potentially this could be added to the Alphabet object, and 450 I would like it to be an option when creating a Seq object... but 451 that might slow things down. 452 """ 453 letters = sequence.alphabet.letters 454 if not letters: 455 raise ValueError("Alphabet does not define letters.") 456 for letter in sequence: 457 if letter not in letters: 458 return False 459 return True
460