Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.IUPACData

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4   
  5  """Information about the IUPAC alphabets.""" 
  6   
  7   
  8  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
  9  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
 10  #   B = "Asx";  aspartic acid or asparagine (D or N) 
 11  #   X = "Xxx";  unknown or 'other' amino acid 
 12  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
 13  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
 14  # 
 15  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 16  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 17  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 18  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 19  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 20  # 
 21  #   Xle (J); Leucine or Isoleucine 
 22  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 23  #   (the one-letter abbreviation) are reserved for the case that cannot 
 24  #   experimentally distinguish leucine from isoleucine. 
 25  # 
 26  #   U = "Sec";  selenocysteine 
 27  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 28  # 
 29  #   O = "Pyl";  pyrrolysine 
 30  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 31   
 32  protein_letters_1to3 = { 
 33      'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 
 34      'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 
 35      'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 
 36      'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 
 37      'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 
 38      'Y': 'Tyr', 
 39  } 
 40  protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({ 
 41      'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 
 42      'U': 'Sel', 'O': 'Pyl', 
 43  }.items())) 
 44   
 45  protein_letters_3to1 = dict((x[1], x[0]) for x in 
 46                              protein_letters_1to3.items()) 
 47  protein_letters_3to1_extended = dict((x[1], x[0]) for x in 
 48                                       protein_letters_1to3_extended.items()) 
 49   
 50  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 51  unambiguous_dna_letters = "GATC" 
 52  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 53  unambiguous_rna_letters = "GAUC" 
 54   
 55  #   B == 5-bromouridine 
 56  #   D == 5,6-dihydrouridine 
 57  #   S == thiouridine 
 58  #   W == wyosine 
 59  extended_dna_letters = "GATCBDSW" 
 60   
 61  # are there extended forms? 
 62  # extended_rna_letters = "GAUCBDSW" 
 63   
 64  ambiguous_dna_values = { 
 65      "A": "A", 
 66      "C": "C", 
 67      "G": "G", 
 68      "T": "T", 
 69      "M": "AC", 
 70      "R": "AG", 
 71      "W": "AT", 
 72      "S": "CG", 
 73      "Y": "CT", 
 74      "K": "GT", 
 75      "V": "ACG", 
 76      "H": "ACT", 
 77      "D": "AGT", 
 78      "B": "CGT", 
 79      "X": "GATC", 
 80      "N": "GATC", 
 81      } 
 82  ambiguous_rna_values = { 
 83      "A": "A", 
 84      "C": "C", 
 85      "G": "G", 
 86      "U": "U", 
 87      "M": "AC", 
 88      "R": "AG", 
 89      "W": "AU", 
 90      "S": "CG", 
 91      "Y": "CU", 
 92      "K": "GU", 
 93      "V": "ACG", 
 94      "H": "ACU", 
 95      "D": "AGU", 
 96      "B": "CGU", 
 97      "X": "GAUC", 
 98      "N": "GAUC", 
 99      } 
100   
# Complement of every DNA code, ambiguity codes included.
ambiguous_dna_complement = {
    # plain bases
    "A": "T", "C": "G", "G": "C", "T": "A",
    # two-base codes
    "M": "K", "R": "Y", "W": "W", "S": "S", "Y": "R", "K": "M",
    # three-base codes
    "V": "B", "H": "D", "D": "H", "B": "V",
    # any-base codes map to themselves
    "X": "X", "N": "N",
}

# Complement of every RNA code; identical to the DNA table except that
# U takes the place of T.
ambiguous_rna_complement = {
    # plain bases
    "A": "U", "C": "G", "G": "C", "U": "A",
    # two-base codes
    "M": "K", "R": "Y", "W": "W", "S": "S", "Y": "R", "K": "M",
    # three-base codes
    "V": "B", "H": "D", "D": "H", "B": "V",
    # any-base codes map to themselves
    "X": "X", "N": "N",
}
138   
139   
def _make_ranges(mydict):
    """Return a new dict mapping each key of ``mydict`` to ``(value, value)``.

    Every entry becomes a two-item tuple whose two items are the same
    value, i.e. a range with identical ends. ``mydict`` is not modified.
    """
    return {letter: (weight, weight) for letter, weight in mydict.items()}
# Mass data taken from PubChem

# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = dict(A=331.2218, C=307.1971, G=347.2212, T=322.2085)

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = dict(
    A=331.06817, C=307.056936, G=347.063084, T=322.056602
)

# (weight, weight) pair for every plain DNA base
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

# RNA counterparts of the two DNA tables above (presumably also
# monophosphate nucleotide masses - confirm against PubChem).
unambiguous_rna_weights = dict(A=347.2212, C=323.1965, G=363.2206, U=324.1813)

monoisotopic_unambiguous_rna_weights = dict(
    A=347.063084, C=323.051851, G=363.057999, U=324.035867
)

# (weight, weight) pair for every plain RNA base
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
def _make_ambiguous_ranges(mydict, weight_table):
    """Build weight-range and average-weight tables for ambiguity codes.

    Arguments:
     - mydict - maps each letter to the sequence of letters it can stand
       for (for example ``{"M": "AC"}``).
     - weight_table - maps a letter to its numeric weight.

    Returns a tuple ``(range_d, avg_d)`` of two dicts keyed by the letters
    of ``mydict``: ``range_d[letter]`` is ``(min, max)`` of the weights of
    the letters it stands for, and ``avg_d[letter]`` is their arithmetic
    mean.

    A letter is left out of both dicts when its weights cannot be fully
    determined, i.e. when it stands for no letters at all or when any of
    the letters it stands for is missing from ``weight_table``.
    """
    range_d = {}
    avg_d = {}
    for letter, values in mydict.items():
        # Skip codes whose weight is undefined. This used to be a hack for
        # single-letter codes only (e.g. U and O); applying it to every
        # unresolved expansion avoids a TypeError from mixing None with
        # numbers, and a ValueError from min() on an empty expansion.
        if not values or any(x not in weight_table for x in values):
            continue

        weights = [weight_table[x] for x in values]
        range_d[letter] = (min(weights), max(weights))

        # Accumulate left to right from 0.0 rather than calling sum():
        # the float summation algorithm of sum() differs between Python
        # versions, and the averages should not depend on the interpreter.
        total_w = 0.0
        for w in weights:
            total_w += w
        avg_d[letter] = total_w / len(weights)
    return range_d, avg_d
# (min, max) weight range and average weight for every DNA / RNA code.
(ambiguous_dna_weight_ranges,
 avg_ambiguous_dna_weights) = _make_ambiguous_ranges(ambiguous_dna_values,
                                                     unambiguous_dna_weights)

(ambiguous_rna_weight_ranges,
 avg_ambiguous_rna_weights) = _make_ambiguous_ranges(ambiguous_rna_values,
                                                     unambiguous_rna_weights)

# Weight of each amino acid, keyed by one-letter code.
protein_weights = {
    "A": 89.0932, "C": 121.1582, "D": 133.1027, "E": 147.1293,
    "F": 165.1891, "G": 75.0666, "H": 155.1546, "I": 131.1729,
    "K": 146.1876, "L": 131.1729, "M": 149.2113, "N": 132.1179,
    "O": 255.3134, "P": 115.1305, "Q": 146.1445, "R": 174.201,
    "S": 105.0926, "T": 119.1192, "U": 168.0532, "V": 117.1463,
    "W": 204.2252, "Y": 181.1885
}

# Monoisotopic counterpart of the table above.
monoisotopic_protein_weights = {
    "A": 89.047678, "C": 121.019749, "D": 133.037508, "E": 147.053158,
    "F": 165.078979, "G": 75.032028, "H": 155.069477, "I": 131.094629,
    "K": 146.105528, "L": 131.094629, "M": 149.051049, "N": 132.053492,
    "O": 255.158292, "P": 115.063329, "Q": 146.069142, "R": 174.111676,
    "S": 105.042593, "T": 119.058243, "U": 168.964203, "V": 117.078979,
    "W": 204.089878, "Y": 181.073893,
}

# Each protein letter mapped to the letters it can stand for; only
# B, J, X and Z stand for more than one letter.
extended_protein_values = {
    "A": "A", "B": "ND", "C": "C", "D": "D", "E": "E", "F": "F",
    "G": "G", "H": "H", "I": "I", "J": "IL", "K": "K", "L": "L",
    "M": "M", "N": "N", "O": "O", "P": "P", "Q": "Q", "R": "R",
    "S": "S", "T": "T", "U": "U", "V": "V", "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "Y": "Y", "Z": "QE",
}

# (weight, weight) pair for every letter in protein_weights.
protein_weight_ranges = _make_ranges(protein_weights)

# (min, max) weight range and average weight for every extended letter.
(extended_protein_weight_ranges,
 avg_extended_protein_weights) = _make_ambiguous_ranges(
    extended_protein_values, protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Keyed by element symbol, listed from hydrogen through meitnerium.
atom_weights = {
    'H': 1.00794, 'He': 4.002602,
    'Li': 6.941, 'Be': 9.012182, 'B': 10.811, 'C': 12.0107,
    'N': 14.0067, 'O': 15.9994, 'F': 18.9984032, 'Ne': 20.1797,
    'Na': 22.989770, 'Mg': 24.3050, 'Al': 26.981538, 'Si': 28.0855,
    'P': 30.973761, 'S': 32.065, 'Cl': 35.453, 'Ar': 39.948,
    'K': 39.0983, 'Ca': 40.078, 'Sc': 44.955910, 'Ti': 47.867,
    'V': 50.9415, 'Cr': 51.9961, 'Mn': 54.938049, 'Fe': 55.845,
    'Co': 58.933200, 'Ni': 58.6934, 'Cu': 63.546, 'Zn': 65.39,
    'Ga': 69.723, 'Ge': 72.64, 'As': 74.92160, 'Se': 78.96,
    'Br': 79.904, 'Kr': 83.80,
    'Rb': 85.4678, 'Sr': 87.62, 'Y': 88.90585, 'Zr': 91.224,
    'Nb': 92.90638, 'Mo': 95.94, 'Tc': 98.0, 'Ru': 101.07,
    'Rh': 102.90550, 'Pd': 106.42, 'Ag': 107.8682, 'Cd': 112.411,
    'In': 114.818, 'Sn': 118.710, 'Sb': 121.760, 'Te': 127.60,
    'I': 126.90447, 'Xe': 131.293,
    'Cs': 132.90545, 'Ba': 137.327, 'La': 138.9055, 'Ce': 140.116,
    'Pr': 140.90765, 'Nd': 144.24, 'Pm': 145.0, 'Sm': 150.36,
    'Eu': 151.964, 'Gd': 157.25, 'Tb': 158.92534, 'Dy': 162.50,
    'Ho': 164.93032, 'Er': 167.259, 'Tm': 168.93421, 'Yb': 173.04,
    'Lu': 174.967, 'Hf': 178.49, 'Ta': 180.9479, 'W': 183.84,
    'Re': 186.207, 'Os': 190.23, 'Ir': 192.217, 'Pt': 195.078,
    'Au': 196.96655, 'Hg': 200.59, 'Tl': 204.3833, 'Pb': 207.2,
    'Bi': 208.98038, 'Po': 208.98, 'At': 209.99, 'Rn': 222.02,
    'Fr': 223.02, 'Ra': 226.03, 'Ac': 227.03, 'Th': 232.0381,
    'Pa': 231.03588, 'U': 238.02891, 'Np': 237.05, 'Pu': 244.06,
    'Am': 243.06, 'Cm': 247.07, 'Bk': 247.07, 'Cf': 251.08,
    'Es': 252.08, 'Fm': 257.10, 'Md': 258.10, 'No': 259.10,
    'Lr': 262.11, 'Rf': 261.11, 'Db': 262.11, 'Sg': 266.12,
    'Bh': 264.12, 'Hs': 269.13, 'Mt': 268.14,
}