Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Data.IUPACData

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4   
  5  """Information about the IUPAC alphabets.""" 
  6   
  7  __docformat__ = "restructuredtext en" 
  8   
  9   
 10  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
 11  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
 12  #   B = "Asx";  aspartic acid or asparagine (D or N) 
 13  #   X = "Xxx";  unknown or 'other' amino acid 
 14  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
 15  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
 16  # 
 17  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 18  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 19  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 20  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 21  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 22  # 
 23  #   Xle (J); Leucine or Isoleucine 
 24  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 25  #   (the one-letter abbreviation) are reserved for the case that cannot 
 26  #   experimentally distinguish leucine from isoleucine. 
 27  # 
 28  #   U = "Sec";  selenocysteine 
 29  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 30  # 
 31  #   O = "Pyl";  pyrrolysine 
 32  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 33   
 34  protein_letters_1to3 = { 
 35      'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 
 36      'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 
 37      'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 
 38      'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 
 39      'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 
 40      'Y': 'Tyr', 
 41  } 
 42  protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({ 
 43      'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 
 44      'U': 'Sel', 'O': 'Pyl', 
 45  }.items())) 
 46   
 47  protein_letters_3to1 = dict((x[1], x[0]) for x in 
 48                              protein_letters_1to3.items()) 
 49  protein_letters_3to1_extended = dict((x[1], x[0]) for x in 
 50                                       protein_letters_1to3_extended.items()) 
 51   
 52  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 53  unambiguous_dna_letters = "GATC" 
 54  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 55  unambiguous_rna_letters = "GAUC" 
 56   
 57  #   B == 5-bromouridine 
 58  #   D == 5,6-dihydrouridine 
 59  #   S == thiouridine 
 60  #   W == wyosine 
 61  extended_dna_letters = "GATCBDSW" 
 62   
 63  # are there extended forms? 
 64  # extended_rna_letters = "GAUCBDSW" 
 65   
 66  ambiguous_dna_values = { 
 67      "A": "A", 
 68      "C": "C", 
 69      "G": "G", 
 70      "T": "T", 
 71      "M": "AC", 
 72      "R": "AG", 
 73      "W": "AT", 
 74      "S": "CG", 
 75      "Y": "CT", 
 76      "K": "GT", 
 77      "V": "ACG", 
 78      "H": "ACT", 
 79      "D": "AGT", 
 80      "B": "CGT", 
 81      "X": "GATC", 
 82      "N": "GATC", 
 83      } 
 84  ambiguous_rna_values = { 
 85      "A": "A", 
 86      "C": "C", 
 87      "G": "G", 
 88      "U": "U", 
 89      "M": "AC", 
 90      "R": "AG", 
 91      "W": "AU", 
 92      "S": "CG", 
 93      "Y": "CU", 
 94      "K": "GU", 
 95      "V": "ACG", 
 96      "H": "ACU", 
 97      "D": "AGU", 
 98      "B": "CGU", 
 99      "X": "GAUC", 
100      "N": "GAUC", 
101      } 
102   
# Complement of every DNA letter, ambiguity codes included.
# W, S, X and N are their own complements.
ambiguous_dna_complement = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}

# RNA counterpart of ambiguous_dna_complement (U in place of T).
ambiguous_rna_complement = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}


142 -def _make_ranges(mydict):
143 d = {} 144 for key, value in mydict.items(): 145 d[key] = (value, value) 146 return d
# Mass data taken from PubChem

# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {
    "A": 331.2218,
    "C": 307.1971,
    "G": 347.2212,
    "T": 322.2085,
}

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817,
    "C": 307.056936,
    "G": 347.063084,
    "T": 322.056602,
}

# Degenerate (weight, weight) ranges for the unambiguous DNA letters.
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

# RNA average masses.
# NOTE(review): presumably the monophosphate ribonucleotide counterparts of
# the DNA table above -- confirm against PubChem.
unambiguous_rna_weights = {
    "A": 347.2212,
    "C": 323.1965,
    "G": 363.2206,
    "U": 324.1813,
}

# RNA monoisotopic masses (same caveat as the average RNA masses).
monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084,
    "C": 323.051851,
    "G": 363.057999,
    "U": 324.035867,
}

# Degenerate (weight, weight) ranges for the unambiguous RNA letters.
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


185 -def _make_ambiguous_ranges(mydict, weight_table):
186 range_d = {} 187 avg_d = {} 188 for letter, values in mydict.items(): 189 # Following line is a quick hack to skip undefined weights for U and O 190 if len(values) == 1 and values[0] not in weight_table: 191 continue 192 193 weights = [weight_table.get(x) for x in values] 194 range_d[letter] = (min(weights), max(weights)) 195 total_w = 0.0 196 for w in weights: 197 total_w = total_w + w 198 avg_d[letter] = total_w / len(weights) 199 return range_d, avg_d
# Weight range and mean weight for every (possibly ambiguous) DNA letter.
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
    ambiguous_dna_values, unambiguous_dna_weights
)

# Weight range and mean weight for every (possibly ambiguous) RNA letter.
ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
    ambiguous_rna_values, unambiguous_rna_weights
)

# Average masses of the amino acids, keyed by one-letter code.
# Includes O (pyrrolysine) and U (selenocysteine).
# NOTE(review): presumably from PubChem like the nucleotide masses -- confirm.
protein_weights = {
    "A": 89.0932,
    "C": 121.1582,
    "D": 133.1027,
    "E": 147.1293,
    "F": 165.1891,
    "G": 75.0666,
    "H": 155.1546,
    "I": 131.1729,
    "K": 146.1876,
    "L": 131.1729,
    "M": 149.2113,
    "N": 132.1179,
    "O": 255.3134,
    "P": 115.1305,
    "Q": 146.1445,
    "R": 174.201,
    "S": 105.0926,
    "T": 119.1192,
    "U": 168.0532,
    "V": 117.1463,
    "W": 204.2252,
    "Y": 181.1885,
}

# Monoisotopic masses of the same amino acids.
monoisotopic_protein_weights = {
    "A": 89.047678,
    "C": 121.019749,
    "D": 133.037508,
    "E": 147.053158,
    "F": 165.078979,
    "G": 75.032028,
    "H": 155.069477,
    "I": 131.094629,
    "K": 146.105528,
    "L": 131.094629,
    "M": 149.051049,
    "N": 132.053492,
    "O": 255.158292,
    "P": 115.063329,
    "Q": 146.069142,
    "R": 174.111676,
    "S": 105.042593,
    "T": 119.058243,
    "U": 168.964203,
    "V": 117.078979,
    "W": 204.089878,
    "Y": 181.073893,
}

# For each extended protein letter, the amino acids it may stand for.
extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "Y": "Y",
    "Z": "QE",
}

# Degenerate (weight, weight) ranges for the letters in protein_weights.
protein_weight_ranges = _make_ranges(protein_weights)

# Weight range and mean weight for every extended protein letter.
extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
    extended_protein_values, protein_weights
)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
    'H': 1.00794,
    'He': 4.002602,
    'Li': 6.941,
    'Be': 9.012182,
    'B': 10.811,
    'C': 12.0107,
    'N': 14.0067,
    'O': 15.9994,
    'F': 18.9984032,
    'Ne': 20.1797,
    'Na': 22.989770,
    'Mg': 24.3050,
    'Al': 26.981538,
    'Si': 28.0855,
    'P': 30.973761,
    'S': 32.065,
    'Cl': 35.453,
    'Ar': 39.948,
    'K': 39.0983,
    'Ca': 40.078,
    'Sc': 44.955910,
    'Ti': 47.867,
    'V': 50.9415,
    'Cr': 51.9961,
    'Mn': 54.938049,
    'Fe': 55.845,
    'Co': 58.933200,
    'Ni': 58.6934,
    'Cu': 63.546,
    'Zn': 65.39,
    'Ga': 69.723,
    'Ge': 72.64,
    'As': 74.92160,
    'Se': 78.96,
    'Br': 79.904,
    'Kr': 83.80,
    'Rb': 85.4678,
    'Sr': 87.62,
    'Y': 88.90585,
    'Zr': 91.224,
    'Nb': 92.90638,
    'Mo': 95.94,
    'Tc': 98.0,
    'Ru': 101.07,
    'Rh': 102.90550,
    'Pd': 106.42,
    'Ag': 107.8682,
    'Cd': 112.411,
    'In': 114.818,
    'Sn': 118.710,
    'Sb': 121.760,
    'Te': 127.60,
    'I': 126.90447,
    'Xe': 131.293,
    'Cs': 132.90545,
    'Ba': 137.327,
    'La': 138.9055,
    'Ce': 140.116,
    'Pr': 140.90765,
    'Nd': 144.24,
    'Pm': 145.0,
    'Sm': 150.36,
    'Eu': 151.964,
    'Gd': 157.25,
    'Tb': 158.92534,
    'Dy': 162.50,
    'Ho': 164.93032,
    'Er': 167.259,
    'Tm': 168.93421,
    'Yb': 173.04,
    'Lu': 174.967,
    'Hf': 178.49,
    'Ta': 180.9479,
    'W': 183.84,
    'Re': 186.207,
    'Os': 190.23,
    'Ir': 192.217,
    'Pt': 195.078,
    'Au': 196.96655,
    'Hg': 200.59,
    'Tl': 204.3833,
    'Pb': 207.2,
    'Bi': 208.98038,
    'Po': 208.98,
    'At': 209.99,
    'Rn': 222.02,
    'Fr': 223.02,
    'Ra': 226.03,
    'Ac': 227.03,
    'Th': 232.0381,
    'Pa': 231.03588,
    'U': 238.02891,
    'Np': 237.05,
    'Pu': 244.06,
    'Am': 243.06,
    'Cm': 247.07,
    'Bk': 247.07,
    'Cf': 251.08,
    'Es': 252.08,
    'Fm': 257.10,
    'Md': 258.10,
    'No': 259.10,
    'Lr': 262.11,
    'Rf': 261.11,
    'Db': 262.11,
    'Sg': 266.12,
    'Bh': 264.12,
    'Hs': 269.13,
    'Mt': 268.14,
}