Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.IUPACData

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4   
  5  """Information about the IUPAC alphabets.""" 
  6   
  7   
  8  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
  9  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
 10  #   B = "Asx";  aspartic acid or asparagine (D or N) 
 11  #   X = "Xxx";  unknown or 'other' amino acid 
 12  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
 13  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
 14  # 
 15  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 16  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 17  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 18  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 19  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 20  # 
 21  #   Xle (J); Leucine or Isoleucine 
 22  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 23  #   (the one-letter abbreviation) are reserved for the case that cannot 
 24  #   experimentally distinguish leucine from isoleucine. 
 25  # 
 26  #   U = "Sec";  selenocysteine 
 27  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 28  # 
 29  #   O = "Pyl";  pyrrolysine 
 30  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 31   
 32  protein_letters_1to3 = { 
 33      'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 
 34      'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 
 35      'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 
 36      'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 
 37      'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 
 38      'Y': 'Tyr', 
 39  } 
 40  protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({ 
 41      'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 
 42      'U': 'Sel', 'O': 'Pyl', 
 43  }.items())) 
 44   
 45  protein_letters_3to1 = dict((x[1], x[0]) for x in 
 46                              protein_letters_1to3.items()) 
 47  protein_letters_3to1_extended = dict((x[1], x[0]) for x in 
 48                                       protein_letters_1to3_extended.items()) 
 49   
 50  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 51  unambiguous_dna_letters = "GATC" 
 52  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 53  unambiguous_rna_letters = "GAUC" 
 54   
 55  #   B == 5-bromouridine 
 56  #   D == 5,6-dihydrouridine 
 57  #   S == thiouridine 
 58  #   W == wyosine 
 59  extended_dna_letters = "GATCBDSW" 
 60   
 61  # are there extended forms? 
 62  # extended_rna_letters = "GAUCBDSW" 
 63   
 64  ambiguous_dna_values = { 
 65      "A": "A", 
 66      "C": "C", 
 67      "G": "G", 
 68      "T": "T", 
 69      "M": "AC", 
 70      "R": "AG", 
 71      "W": "AT", 
 72      "S": "CG", 
 73      "Y": "CT", 
 74      "K": "GT", 
 75      "V": "ACG", 
 76      "H": "ACT", 
 77      "D": "AGT", 
 78      "B": "CGT", 
 79      "X": "GATC", 
 80      "N": "GATC", 
 81      } 
 82  ambiguous_rna_values = { 
 83      "A": "A", 
 84      "C": "C", 
 85      "G": "G", 
 86      "U": "U", 
 87      "M": "AC", 
 88      "R": "AG", 
 89      "W": "AU", 
 90      "S": "CG", 
 91      "Y": "CU", 
 92      "K": "GU", 
 93      "V": "ACG", 
 94      "H": "ACU", 
 95      "D": "AGU", 
 96      "B": "CGU", 
 97      "X": "GAUC", 
 98      "N": "GAUC", 
 99      } 
100   
# Complement of every DNA letter, ambiguity codes included.  The mapping
# is its own inverse: looking a letter up twice returns the original.
ambiguous_dna_complement = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}

# RNA counterpart of the table above (U pairs with A).
ambiguous_rna_complement = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}
138   
139   
140 -def _make_ranges(mydict):
141 d = {} 142 for key, value in mydict.items(): 143 d[key] = (value, value) 144 return d
# Mass data taken from PubChem


# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {
    "A": 331.2218,
    "C": 307.1971,
    "G": 347.2212,
    "T": 322.2085,
}

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817,
    "C": 307.056936,
    "G": 347.063084,
    "T": 322.056602,
}

# Each average DNA weight expressed as a (value, value) pair.
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

# RNA counterparts of the two DNA tables above.
unambiguous_rna_weights = {
    "A": 347.2212,
    "C": 323.1965,
    "G": 363.2206,
    "U": 324.1813,
}

monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084,
    "C": 323.051851,
    "G": 363.057999,
    "U": 324.035867,
}

# Each average RNA weight expressed as a (value, value) pair.
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
184 -def _make_ambiguous_ranges(mydict, weight_table):
185 range_d = {} 186 avg_d = {} 187 for letter, values in mydict.items(): 188 # Following line is a quick hack to skip undefined weights for U and O 189 if len(values) == 1 and values[0] not in weight_table: 190 continue 191 192 weights = [weight_table.get(x) for x in values] 193 range_d[letter] = (min(weights), max(weights)) 194 total_w = 0.0 195 for w in weights: 196 total_w = total_w + w 197 avg_d[letter] = total_w / len(weights) 198 return range_d, avg_d
# (min, max) weight range and average weight for every DNA / RNA letter,
# ambiguity codes included, computed from the unambiguous average weights.
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = \
    _make_ambiguous_ranges(ambiguous_dna_values,
                           unambiguous_dna_weights)

ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = \
    _make_ambiguous_ranges(ambiguous_rna_values,
                           unambiguous_rna_weights)

# Amino-acid weights keyed by one-letter code (the 20 standard residues
# plus O and U).
protein_weights = {
    "A": 89.0932,
    "C": 121.1582,
    "D": 133.1027,
    "E": 147.1293,
    "F": 165.1891,
    "G": 75.0666,
    "H": 155.1546,
    "I": 131.1729,
    "K": 146.1876,
    "L": 131.1729,
    "M": 149.2113,
    "N": 132.1179,
    "O": 255.3134,
    "P": 115.1305,
    "Q": 146.1445,
    "R": 174.201,
    "S": 105.0926,
    "T": 119.1192,
    "U": 168.0532,
    "V": 117.1463,
    "W": 204.2252,
    "Y": 181.1885,
}

# Monoisotopic counterpart of protein_weights (same keys).
monoisotopic_protein_weights = {
    "A": 89.047678,
    "C": 121.019749,
    "D": 133.037508,
    "E": 147.053158,
    "F": 165.078979,
    "G": 75.032028,
    "H": 155.069477,
    "I": 131.094629,
    "K": 146.105528,
    "L": 131.094629,
    "M": 149.051049,
    "N": 132.053492,
    "O": 255.158292,
    "P": 115.063329,
    "Q": 146.069142,
    "R": 174.111676,
    "S": 105.042593,
    "T": 119.058243,
    "U": 168.964203,
    "V": 117.078979,
    "W": 204.089878,
    "Y": 181.073893,
}

# Each protein letter mapped to the residues it may stand for.
extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "Y": "Y",
    "Z": "QE",
}

# Each protein weight expressed as a (value, value) pair.
protein_weight_ranges = _make_ranges(protein_weights)

# (min, max) weight range and average weight for every extended protein
# letter, computed from protein_weights.
extended_protein_weight_ranges, avg_extended_protein_weights = \
    _make_ambiguous_ranges(extended_protein_values,
                           protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
    'H': 1.00794,
    'D': 2.01410,
    'He': 4.002602,
    'Li': 6.941,
    'Be': 9.012182,
    'B': 10.811,
    'C': 12.0107,
    'N': 14.0067,
    'O': 15.9994,
    'F': 18.9984032,
    'Ne': 20.1797,
    'Na': 22.989770,
    'Mg': 24.3050,
    'Al': 26.981538,
    'Si': 28.0855,
    'P': 30.973761,
    'S': 32.065,
    'Cl': 35.453,
    'Ar': 39.948,
    'K': 39.0983,
    'Ca': 40.078,
    'Sc': 44.955910,
    'Ti': 47.867,
    'V': 50.9415,
    'Cr': 51.9961,
    'Mn': 54.938049,
    'Fe': 55.845,
    'Co': 58.933200,
    'Ni': 58.6934,
    'Cu': 63.546,
    'Zn': 65.39,
    'Ga': 69.723,
    'Ge': 72.64,
    'As': 74.92160,
    'Se': 78.96,
    'Br': 79.904,
    'Kr': 83.80,
    'Rb': 85.4678,
    'Sr': 87.62,
    'Y': 88.90585,
    'Zr': 91.224,
    'Nb': 92.90638,
    'Mo': 95.94,
    'Tc': 98.0,
    'Ru': 101.07,
    'Rh': 102.90550,
    'Pd': 106.42,
    'Ag': 107.8682,
    'Cd': 112.411,
    'In': 114.818,
    'Sn': 118.710,
    'Sb': 121.760,
    'Te': 127.60,
    'I': 126.90447,
    'Xe': 131.293,
    'Cs': 132.90545,
    'Ba': 137.327,
    'La': 138.9055,
    'Ce': 140.116,
    'Pr': 140.90765,
    'Nd': 144.24,
    'Pm': 145.0,
    'Sm': 150.36,
    'Eu': 151.964,
    'Gd': 157.25,
    'Tb': 158.92534,
    'Dy': 162.50,
    'Ho': 164.93032,
    'Er': 167.259,
    'Tm': 168.93421,
    'Yb': 173.04,
    'Lu': 174.967,
    'Hf': 178.49,
    'Ta': 180.9479,
    'W': 183.84,
    'Re': 186.207,
    'Os': 190.23,
    'Ir': 192.217,
    'Pt': 195.078,
    'Au': 196.96655,
    'Hg': 200.59,
    'Tl': 204.3833,
    'Pb': 207.2,
    'Bi': 208.98038,
    'Po': 208.98,
    'At': 209.99,
    'Rn': 222.02,
    'Fr': 223.02,
    'Ra': 226.03,
    'Ac': 227.03,
    'Th': 232.0381,
    'Pa': 231.03588,
    'U': 238.02891,
    'Np': 237.05,
    'Pu': 244.06,
    'Am': 243.06,
    'Cm': 247.07,
    'Bk': 247.07,
    'Cf': 251.08,
    'Es': 252.08,
    'Fm': 257.10,
    'Md': 258.10,
    'No': 259.10,
    'Lr': 262.11,
    'Rf': 261.11,
    'Db': 262.11,
    'Sg': 266.12,
    'Bh': 264.12,
    'Hs': 269.13,
    'Mt': 268.14,
}