Package Bio :: Package Alphabet
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Alphabet

# Copyright 2000-2002 by Andrew Dalke.
# Revisions copyright 2007-2010 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Alphabets used in Seq objects etc to declare sequence type and letters.

This is used by sequences which contain a finite number of similar words.
"""


class Alphabet(object):
    """Generic alphabet base class.

    Serves as the common ancestor for the other alphabet types.

    Attributes:
     - letters - list-like object holding the letters of the alphabet.
       Usually a string when every letter is a single character.
     - size - length of each letter of the alphabet (e.g. 1 when the
       letters are single characters).
    """

    # No fixed word size unless a subclass sets one.
    size = None
    # No fixed set of letters unless a subclass sets one.  In general this
    # is a list-like object, but a string is used when the letters are
    # single characters, which is what Seq like objects expect.
    letters = None

    def __repr__(self):
        """Return the class name followed by empty parentheses."""
        return "%s()" % self.__class__.__name__

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  Only the Alphabet subclassing hierarchy is
        consulted; the letters property is never inspected.  This isn't
        ideal, and doesn't seem to work as intended with the
        AlphabetEncoder classes.
        """
        own_class = self.__class__
        return isinstance(other, own_class)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE)."""
        # TODO - remove this method by dealing with things in subclasses?
        # Checked in order, so the DNA and RNA classes are tried before
        # their NucleotideAlphabet parent; the first match wins.
        fallbacks = (
            (ProteinAlphabet, generic_protein),
            (DNAAlphabet, generic_dna),
            (RNAAlphabet, generic_rna),
            (NucleotideAlphabet, generic_nucleotide),
            (SingleLetterAlphabet, single_letter_alphabet),
        )
        for alphabet_class, generic_instance in fallbacks:
            if isinstance(self, alphabet_class):
                return generic_instance
        return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if letters and letters != letters.upper():
            # TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()
        # Easy case, no letters or already upper case!
        return self

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        letters = self.letters
        if letters and letters != letters.lower():
            # TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()
        # Easy case, no letters or already lower case!
        return self


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Generic alphabet in which every letter has size one."""

    # Each letter is exactly one character wide.
    size = 1
    # String of all letters in the alphabet; None means not specified.
    letters = None


single_letter_alphabet = SingleLetterAlphabet()

# ########## Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Generic single letter protein alphabet.

    Adds nothing to SingleLetterAlphabet beyond its own type.
    """


generic_protein = ProteinAlphabet()

# ########## DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Generic single letter nucleotide alphabet.

    Adds nothing to SingleLetterAlphabet beyond its own type.
    """


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Generic single letter DNA alphabet.

    Adds nothing to NucleotideAlphabet beyond its own type.
    """


generic_dna = DNAAlphabet()


# ########## RNA


class RNAAlphabet(NucleotideAlphabet):
    """Generic single letter RNA alphabet.

    Adds nothing to NucleotideAlphabet beyond its own type.
    """


generic_rna = RNAAlphabet()

# ########## Other per-sequence encodings


class SecondaryStructure(SingleLetterAlphabet):
    """Single letter alphabet for describing secondary structure.

    The four letters are 'H' (helix), 'S' (strand), 'T' (turn) and
    'C' (coil).
    """

    letters = "HSTC"


class ThreeLetterProtein(Alphabet):
    """Three letter protein alphabet."""

    # Every letter of this alphabet is three characters long.
    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
        "Gly", "His", "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
    ]

    def _upper(self):
        """Always raise NotImplementedError; no upper case variant exists (PRIVATE)."""
        message = "We don't have an uppercase three letter protein alphabet."
        raise NotImplementedError(message)

    def _lower(self):
        """Always raise NotImplementedError; no lower case variant exists (PRIVATE)."""
        message = "We don't have a lowercase three letter protein alphabet."
        raise NotImplementedError(message)


# ##### Non per-sequence modifications

# (These are Decorator classes)


class AlphabetEncoder(object):
    """Decorator class wrapping an alphabet and adding extra letters to it.

    Attribute lookups that are not resolved on the encoder itself are
    forwarded to the wrapped alphabet (see __getattr__).

    Attributes:
     - alphabet - the wrapped alphabet object.
     - new_letters - the letters added on top of the wrapped alphabet.
     - letters - the wrapped alphabet's letters followed by new_letters,
       or None when the wrapped alphabet's letters attribute is None.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap the given alphabet, extending its letters with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Forward unresolved attribute lookups to the wrapped alphabet.

        Raises AttributeError for double-underscore (special) names, and
        for any name when no wrapped alphabet has been set yet.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # Only reached when the instance has no wrapped alphabet yet
            # (e.g. created via __new__ without __init__).  Evaluating
            # self.alphabet below would re-enter this method without end,
            # so fail cleanly instead.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return the class name with the wrapped alphabet and new letters."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False.
        """
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())


class Gapped(AlphabetEncoder):
    """Alphabet encoder that adds a gap character to the wrapped alphabet."""

    def __init__(self, alphabet, gap_char="-"):
        """Wrap the alphabet, registering gap_char as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The gap characters must match, and the
        wrapped alphabet must contain the other's wrapped alphabet
        according to the Alphabet subclassing hierarchy.  This fails
        if the other alphabet does not have a gap character!
        """
        if other.gap_char != self.gap_char:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        upper_gap = self.gap_char.upper()
        return Gapped(upper_base, upper_gap)

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        lower_gap = self.gap_char.lower()
        return Gapped(lower_base, lower_gap)


class HasStopCodon(AlphabetEncoder):
    """Alphabet encoder that adds a stop symbol to the wrapped alphabet."""

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap the alphabet, registering stop_symbol as its extra letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols must match, and the
        wrapped alphabet must contain the other's wrapped alphabet
        according to the Alphabet subclassing hierarchy.  This fails
        if the other alphabet does not have a stop symbol!
        """
        if other.stop_symbol != self.stop_symbol:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        upper_stop = self.stop_symbol.upper()
        return HasStopCodon(upper_base, upper_stop)

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        lower_stop = self.stop_symbol.lower()
        return HasStopCodon(lower_base, lower_stop)


def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Unwraps nested AlphabetEncoder objects by following their alphabet
    attribute until something that is not an AlphabetEncoder is reached.

    Raises AssertionError if the innermost object is not an Alphabet
    instance.  The check is an explicit raise rather than an assert
    statement so that it still runs when Python is started with -O.
    """
    a = alphabet
    while isinstance(a, AlphabetEncoder):
        a = a.alphabet
    if not isinstance(a, Alphabet):
        raise AssertionError("Invalid alphabet found, %s" % repr(a))
    return a


247 -def _ungap(alphabet):
248 """Returns the alphabet without any gap encoder (PRIVATE).""" 249 # TODO - Handle via method of the objects? 250 if not hasattr(alphabet, "gap_char"): 251 return alphabet 252 elif isinstance(alphabet, Gapped): 253 return alphabet.alphabet 254 elif isinstance(alphabet, HasStopCodon): 255 return HasStopCodon(_ungap(alphabet.alphabet), stop_symbol=alphabet.stop_symbol) 256 elif isinstance(alphabet, AlphabetEncoder): 257 return AlphabetEncoder(_ungap(alphabet.alphabet), letters=alphabet.letters) 258 else: 259 raise NotImplementedError
260 261
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    Any AlphabetEncoder information (e.g. from Gapped alphabets) is
    discarded; only the underlying base alphabets are compared.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!
    """
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        if consensus is None:
            # First alphabet seen becomes the starting point.
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or a subclass of the current consensus class:
            # the current consensus already covers it.
            continue
        if isinstance(consensus, base.__class__):
            # The new alphabet's class is a parent of the consensus class.
            consensus = base
        elif (isinstance(base, NucleotideAlphabet)
              and isinstance(consensus, NucleotideAlphabet)):
            # e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
        elif (isinstance(base, SingleLetterAlphabet)
              and isinstance(consensus, SingleLetterAlphabet)):
            # This is a pretty big mis-match!
            consensus = single_letter_alphabet
        else:
            # We have a major mis-match... take the easy way out!
            return generic_alphabet
    if consensus is None:
        # Given NO alphabets!
        return generic_alphabet
    return consensus


def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter. These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()

    Gapped, HasStopCodon and the new letters added by other
    AlphabetEncoders are taken into account. This WILL raise an exception
    if more than one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present
    """
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    extra_letters = ""
    for alpha in alphabets:
        # Gaps...
        if hasattr(alpha, "gap_char"):
            if gap is None:
                gap = alpha.gap_char
            elif gap != alpha.gap_char:
                raise ValueError("More than one gap character present")
        # Stops...
        if hasattr(alpha, "stop_symbol"):
            if stop is None:
                stop = alpha.stop_symbol
            elif stop != alpha.stop_symbol:
                raise ValueError("More than one stop symbol present")
        # New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                # Skip letters already collected, and the gap and stop
                # characters seen so far.
                if letter in extra_letters or letter == gap or letter == stop:
                    continue
                extra_letters += letter

    # Re-wrap the base alphabet: new letters innermost, then gap, then stop.
    result = base
    if extra_letters:
        result = AlphabetEncoder(result, extra_letters)
    if gap:
        result = Gapped(result, gap_char=gap)
    if stop:
        result = HasStopCodon(result, stop_symbol=stop)
    return result


374 -def _check_type_compatible(alphabets):
375 """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE). 376 377 >>> _check_type_compatible([generic_dna, generic_nucleotide]) 378 True 379 >>> _check_type_compatible([generic_dna, generic_rna]) 380 False 381 >>> _check_type_compatible([generic_dna, generic_protein]) 382 False 383 >>> _check_type_compatible([single_letter_alphabet, generic_protein]) 384 True 385 386 This relies on the Alphabet subclassing hierarchy. It does not 387 check things like gap characters or stop symbols.""" 388 dna, rna, nucl, protein = False, False, False, False 389 for alpha in alphabets: 390 a = _get_base_alphabet(alpha) 391 if isinstance(a, DNAAlphabet): 392 dna = True 393 nucl = True 394 if rna or protein: 395 return False 396 elif isinstance(a, RNAAlphabet): 397 rna = True 398 nucl = True 399 if dna or protein: 400 return False 401 elif isinstance(a, NucleotideAlphabet): 402 nucl = True 403 if protein: 404 return False 405 elif isinstance(a, ProteinAlphabet): 406 protein = True 407 if nucl: 408 return False 409 return True
410 411
412 -def _verify_alphabet(sequence):
413 """Check all letters in sequence are in the alphabet (PRIVATE). 414 415 >>> from Bio.Seq import Seq 416 >>> from Bio.Alphabet import IUPAC 417 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 418 ... IUPAC.protein) 419 >>> _verify_alphabet(my_seq) 420 True 421 422 This example has an X, which is not in the IUPAC protein alphabet 423 (you should be using the IUPAC extended protein alphabet): 424 425 >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX", 426 ... IUPAC.protein) 427 >>> _verify_alphabet(bad_seq) 428 False 429 430 This replaces Bio.utils.verify_alphabet() since we are deprecating 431 that. Potentially this could be added to the Alphabet object, and 432 I would like it to be an option when creating a Seq object... but 433 that might slow things down. 434 """ 435 letters = sequence.alphabet.letters 436 if not letters: 437 raise ValueError("Alphabet does not define letters.") 438 for letter in sequence: 439 if letter not in letters: 440 return False 441 return True
442