Package Bio :: Package Alphabet
[hide private]
[frames] | no frames]

Source Code for Package Bio.Alphabet

  1  # Copyright 2000-2002 by Andrew Dalke. 
  2  # Revisions copyright 2007-2010 by Peter Cock. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Alphabets used in Seq objects etc to declare sequence type and letters. 
  9   
 10  This is used by sequences which contain a finite number of similar words. 
 11  """ 
 12   
 13  __docformat__ = "restructuredtext en" 
 14   
 15   
class Alphabet(object):
    """Root class of the alphabet hierarchy.

    All other alphabet types in this module derive from this class.

    Attributes:
     - letters - list-like object holding the letters of the alphabet.
       Normally a plain string when every letter is a single character.
     - size - length of each letter of the alphabet (e.g. 1 when the
       letters are single characters).
    """

    # None means there is no fixed alphabet.  In general this is a
    # list-like object; when letters are single characters a string is
    # used, which is what Seq like objects expect.
    letters = None
    # None means words have no fixed size.
    size = None

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  Only the Alphabet subclassing hierarchy is
        consulted; the letters property is never inspected.  This isn't
        ideal, and doesn't seem to work as intended with the
        AlphabetEncoder classes.
        """
        own_class = self.__class__
        return isinstance(other, own_class)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE)."""
        # TODO - remove this method by dealing with things in subclasses?
        # Ordered from most to least specific; the first matching class
        # decides which generic instance is handed back.
        generic_by_class = (
            (ProteinAlphabet, generic_protein),
            (DNAAlphabet, generic_dna),
            (RNAAlphabet, generic_rna),
            (NucleotideAlphabet, generic_nucleotide),
            (SingleLetterAlphabet, single_letter_alphabet),
        )
        for alphabet_class, generic_instance in generic_by_class:
            if isinstance(self, alphabet_class):
                return generic_instance
        return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if not letters:
            # Easy case, no letters at all.
            return self
        if letters == letters.upper():
            # Easy case, already upper case.
            return self
        # TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if not letters:
            # Easy case, no letters at all.
            return self
        if letters == letters.lower():
            # Easy case, already lower case.
            return self
        # TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Base alphabet in which every letter is exactly one character."""

    # String of all letters in the alphabet; left undefined here.
    letters = None
    size = 1


single_letter_alphabet = SingleLetterAlphabet()

# ########## Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Single letter protein alphabet with no specific letters defined."""


generic_protein = ProteinAlphabet()

# ########## DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Single letter nucleotide alphabet with no specific letters defined."""


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Single letter DNA alphabet with no specific letters defined."""


generic_dna = DNAAlphabet()


# ########## RNA


class RNAAlphabet(NucleotideAlphabet):
    """Single letter RNA alphabet with no specific letters defined."""


generic_rna = RNAAlphabet()

# ########## Other per-sequence encodings


class SecondaryStructure(SingleLetterAlphabet):
    """Single letter alphabet for secondary structure annotation.

    The four letters stand for helix ('H'), strand ('S'), turn ('T')
    and coil ('C').
    """

    letters = "HSTC"


class ThreeLetterProtein(Alphabet):
    """Protein alphabet whose letters are three character residue codes."""

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
        "Gly", "His", "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
    ]

    def _upper(self):
        """Always raise: no upper case variant of this alphabet exists."""
        message = "We don't have an uppercase three letter protein alphabet."
        raise NotImplementedError(message)

    def _lower(self):
        """Always raise: no lower case variant of this alphabet exists."""
        message = "We don't have a lowercase three letter protein alphabet."
        raise NotImplementedError(message)


# ##### Non per-sequence modifications

# (These are Decorator classes)


class AlphabetEncoder(object):
    """Decorator that extends an existing alphabet with extra letters.

    The wrapped alphabet is stored as ``alphabet`` and the additional
    letters as ``new_letters``.  ``letters`` is the wrapped alphabet's
    letters followed by the new ones, or None when the wrapped alphabet
    does not define any letters.  Any other non-special attribute that
    is not found on the encoder itself is looked up on the wrapped
    alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap ``alphabet``, adding ``new_letters`` to its letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate unknown attributes to the wrapped alphabet.

        Raises AttributeError for special (double underscore) names and
        when the encoder has no wrapped alphabet to delegate to.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # __getattr__ only runs when normal lookup failed, so getting
            # here means the instance has no wrapped alphabet (e.g. it was
            # created without running __init__).  Reading self.alphabet
            # below would re-enter this method forever, so fail cleanly.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False.
        """
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())


class Gapped(AlphabetEncoder):
    """Alphabet decorator that adds a gap character to another alphabet."""

    def __init__(self, alphabet, gap_char="-"):
        """Wrap ``alphabet``, registering ``gap_char`` as its gap letter."""
        self.gap_char = gap_char
        AlphabetEncoder.__init__(self, alphabet, gap_char)

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  The gap characters are compared first and
        then the wrapped alphabets are compared through the Alphabet
        subclassing hierarchy.  This fails if the other alphabet does
        not have a gap character!
        """
        if other.gap_char == self.gap_char:
            return self.alphabet.contains(other.alphabet)
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return Gapped(upper_base, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return Gapped(lower_base, self.gap_char.lower())


class HasStopCodon(AlphabetEncoder):
    """Alphabet decorator that adds a stop symbol to another alphabet."""

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap ``alphabet``, registering ``stop_symbol`` as its stop letter."""
        self.stop_symbol = stop_symbol
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)

    def contains(self, other):
        """Report whether this alphabet 'contains' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols are compared first and
        then the wrapped alphabets are compared through the Alphabet
        subclassing hierarchy.  This fails if the other alphabet does
        not have a stop symbol!
        """
        if other.stop_symbol == self.stop_symbol:
            return self.alphabet.contains(other.alphabet)
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return HasStopCodon(upper_base, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return HasStopCodon(lower_base, self.stop_symbol.lower())


def _get_base_alphabet(alphabet):
    """Return the Alphabet left after removing all encoders (PRIVATE).

    Every AlphabetEncoder layer (e.g. Gapped or HasStopCodon) is peeled
    off in turn.  An AssertionError is raised if what remains is not an
    Alphabet instance.
    """
    base = alphabet
    while True:
        if not isinstance(base, AlphabetEncoder):
            break
        base = base.alphabet
    assert isinstance(base, Alphabet), "Invalid alphabet found, %s" % repr(base)
    return base


def _ungap(alphabet):
    """Return the alphabet without any gap encoder (PRIVATE).

    Alphabets without a ``gap_char`` attribute are returned unchanged.
    A Gapped alphabet yields the alphabet it wraps.  Other encoders are
    rebuilt around the ungapped version of the alphabet they wrap:
    HasStopCodon keeps its stop symbol and a plain AlphabetEncoder keeps
    its own new letters.

    Raises NotImplementedError for an object that exposes ``gap_char``
    but is not one of the encoder classes handled here.
    """
    # TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # AlphabetEncoder.__init__ takes (alphabet, new_letters); pass only
        # the letters this encoder added, since the combined letters are
        # recomputed from the ungapped wrapped alphabet.
        return AlphabetEncoder(_ungap(alphabet.alphabet),
                               new_letters=alphabet.new_letters)
    else:
        raise NotImplementedError


def _consensus_base_alphabet(alphabets):
    """Return a shared, often generic, base alphabet object (PRIVATE).

    All AlphabetEncoder information (e.g. Gapped alphabets) is discarded
    before the alphabets are compared.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!
    """
    common = None
    for alpha in alphabets:
        candidate = _get_base_alphabet(alpha)
        if common is None:
            # First alphabet seen, nothing to reconcile yet.
            common = candidate
            continue
        if common == candidate or isinstance(candidate, common.__class__):
            # Same alphabet, or the current choice already covers it.
            continue
        if isinstance(common, candidate.__class__):
            # The new alphabet is the more general of the two.
            common = candidate
            continue
        if isinstance(candidate, NucleotideAlphabet) \
                and isinstance(common, NucleotideAlphabet):
            # e.g. Give a mix of RNA and DNA alphabets
            common = generic_nucleotide
        elif isinstance(candidate, SingleLetterAlphabet) \
                and isinstance(common, SingleLetterAlphabet):
            # This is a pretty big mis-match!
            common = single_letter_alphabet
        else:
            # We have a major mis-match... take the easy way out!
            return generic_alphabet
    # common is still None only when NO alphabets were given.
    return generic_alphabet if common is None else common


def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders. This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present

    The argument may be any iterable of alphabets, including a generator.
    """
    # The alphabets are walked twice (once for the base alphabet, once for
    # the encoder details).  Materialise them so a one-shot iterable such
    # as a generator is not already exhausted for the second pass.
    alphabets = list(alphabets)
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    seen_letters = []
    for alpha in alphabets:
        # Gaps...
        if not hasattr(alpha, "gap_char"):
            pass
        elif gap is None:
            gap = alpha.gap_char
        elif gap == alpha.gap_char:
            pass
        else:
            raise ValueError("More than one gap character present")
        # Stops...
        if not hasattr(alpha, "stop_symbol"):
            pass
        elif stop is None:
            stop = alpha.stop_symbol
        elif stop == alpha.stop_symbol:
            pass
        else:
            raise ValueError("More than one stop symbol present")
        # New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                if letter not in seen_letters:
                    seen_letters.append(letter)

    # Drop the gap and stop only once the final values are known, so a
    # gap or stop first declared by a later alphabet is still excluded.
    new_letters = "".join(letter for letter in seen_letters
                          if letter != gap and letter != stop)

    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha


def _check_type_compatible(alphabets):
    """Return False for DNA+RNA or Nucleotide+Protein, else True (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    Only the Alphabet subclassing hierarchy of the base alphabets is
    examined.  Things like gap characters or stop symbols are not
    checked.
    """
    seen_dna = seen_rna = seen_nucl = seen_protein = False
    for alpha in alphabets:
        base = _get_base_alphabet(alpha)
        if isinstance(base, DNAAlphabet):
            if seen_rna or seen_protein:
                return False
            seen_dna = seen_nucl = True
        elif isinstance(base, RNAAlphabet):
            if seen_dna or seen_protein:
                return False
            seen_rna = seen_nucl = True
        elif isinstance(base, NucleotideAlphabet):
            if seen_protein:
                return False
            seen_nucl = True
        elif isinstance(base, ProteinAlphabet):
            if seen_nucl:
                return False
            seen_protein = True
    return True


414 -def _verify_alphabet(sequence):
415 """Check all letters in sequence are in the alphabet (PRIVATE). 416 417 >>> from Bio.Seq import Seq 418 >>> from Bio.Alphabet import IUPAC 419 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 420 ... IUPAC.protein) 421 >>> _verify_alphabet(my_seq) 422 True 423 424 This example has an X, which is not in the IUPAC protein alphabet 425 (you should be using the IUPAC extended protein alphabet): 426 427 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX", 428 ... IUPAC.protein) 429 >>> _verify_alphabet(bad_seq) 430 False 431 432 This replaces Bio.utils.verify_alphabet() since we are deprecating 433 that. Potentially this could be added to the Alphabet object, and 434 I would like it to be an option when creating a Seq object... but 435 that might slow things down. 436 """ 437 letters = sequence.alphabet.letters 438 if not letters: 439 raise ValueError("Alphabet does not define letters.") 440 for letter in sequence: 441 if letter not in letters: 442 return False 443 return True
444