Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames] | no frames]

Source Code for Module Bio.Data.IUPACData

  1  # Information about the IUPAC alphabets 
  2   
  3  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
  4  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
  5  #   B = "Asx";  aspartic acid or asparagine (D or N) 
  6  #   X = "Xxx";  unknown or 'other' amino acid 
  7  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
  8  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
  9  # 
 10  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 11  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 12  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 13  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 14  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 15  # 
 16  #   Xle (J); Leucine or Isoleucine 
 17  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 18  #   (the one-letter abbreviation) are reserved for the case that cannot 
 19  #   experimentally distinguish leucine from isoleucine. 
 20  # 
 21  #   U = "Sec";  selenocysteine 
 22  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 23  # 
 24  #   O = "Pyl";  pyrrolysine 
 25  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 26   
 27  protein_letters_1to3  = { 
 28      'A': 'Ala', 'C': 'Cys', 'D': 'Asp', 
 29      'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His', 
 30      'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met', 
 31      'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 
 32      'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp', 
 33      'Y': 'Tyr', 
 34  } 
 35  protein_letters_1to3_extended = dict(list(protein_letters_1to3.items()) + list({ 
 36      'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle', 
 37      'U': 'Sel', 'O': 'Pyl', 
 38  }.items())) 
 39   
 40  protein_letters_3to1 = dict((x[1], x[0]) for x in 
 41                              protein_letters_1to3.items()) 
 42  protein_letters_3to1_extended = dict((x[1], x[0]) for x in 
 43                                       protein_letters_1to3_extended.items()) 
 44   
 45  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 46  unambiguous_dna_letters = "GATC" 
 47  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 48  unambiguous_rna_letters = "GAUC" 
 49   
 50  #   B == 5-bromouridine 
 51  #   D == 5,6-dihydrouridine 
 52  #   S == thiouridine 
 53  #   W == wyosine 
 54  extended_dna_letters = "GATCBDSW" 
 55   
 56  # are there extended forms? 
 57  #extended_rna_letters = "GAUCBDSW" 
 58   
 59  ambiguous_dna_values = { 
 60      "A": "A", 
 61      "C": "C", 
 62      "G": "G", 
 63      "T": "T", 
 64      "M": "AC", 
 65      "R": "AG", 
 66      "W": "AT", 
 67      "S": "CG", 
 68      "Y": "CT", 
 69      "K": "GT", 
 70      "V": "ACG", 
 71      "H": "ACT", 
 72      "D": "AGT", 
 73      "B": "CGT", 
 74      "X": "GATC", 
 75      "N": "GATC", 
 76      } 
 77  ambiguous_rna_values = { 
 78      "A": "A", 
 79      "C": "C", 
 80      "G": "G", 
 81      "U": "U", 
 82      "M": "AC", 
 83      "R": "AG", 
 84      "W": "AU", 
 85      "S": "CG", 
 86      "Y": "CU", 
 87      "K": "GU", 
 88      "V": "ACG", 
 89      "H": "ACU", 
 90      "D": "AGU", 
 91      "B": "CGU", 
 92      "X": "GAUC", 
 93      "N": "GAUC", 
 94      } 
 95   
 96  ambiguous_dna_complement = { 
 97      "A": "T", 
 98      "C": "G", 
 99      "G": "C", 
100      "T": "A", 
101      "M": "K", 
102      "R": "Y", 
103      "W": "W", 
104      "S": "S", 
105      "Y": "R", 
106      "K": "M", 
107      "V": "B", 
108      "H": "D", 
109      "D": "H", 
110      "B": "V", 
111      "X": "X", 
112      "N": "N", 
113      } 
114   
115  ambiguous_rna_complement = { 
116      "A": "U", 
117      "C": "G", 
118      "G": "C", 
119      "U": "A", 
120      "M": "K", 
121      "R": "Y", 
122      "W": "W", 
123      "S": "S", 
124      "Y": "R", 
125      "K": "M", 
126      "V": "B", 
127      "H": "D", 
128      "D": "H", 
129      "B": "V", 
130      "X": "X", 
131      "N": "N", 
132      } 
133   
134   
135 -def _make_ranges(mydict):
136 d = {} 137 for key, value in mydict.items(): 138 d[key] = (value, value) 139 return d

# Mass data taken from PubChem

# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {
    "A": 331.2218, "C": 307.1971,
    "G": 347.2212, "T": 322.2085,
}

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817, "C": 307.056936,
    "G": 347.063084, "T": 322.056602,
}

# Each average DNA weight w expressed as the degenerate range (w, w).
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

# RNA tables, laid out like the DNA ones above (the original carries no
# description for them; presumably average masses -- confirm against PubChem).
unambiguous_rna_weights = {
    "A": 347.2212, "C": 323.1965,
    "G": 363.2206, "U": 324.1813,
}

monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084, "C": 323.051851,
    "G": 363.057999, "U": 324.035867,
}

# Each average RNA weight w expressed as the degenerate range (w, w).
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


178 -def _make_ambiguous_ranges(mydict, weight_table):
179 range_d = {} 180 avg_d = {} 181 for letter, values in mydict.items(): 182 #Following line is a quick hack to skip undefined weights for U and O 183 if len(values) == 1 and values[0] not in weight_table: 184 continue 185 186 weights = [weight_table.get(x) for x in values] 187 range_d[letter] = (min(weights), max(weights)) 188 total_w = 0.0 189 for w in weights: 190 total_w = total_w + w 191 avg_d[letter] = total_w / len(weights) 192 return range_d, avg_d

# (min, max) weight range and average weight for every DNA letter,
# ambiguity codes included.
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
    ambiguous_dna_values, unambiguous_dna_weights)

# Same for RNA.
ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
    ambiguous_rna_values, unambiguous_rna_weights)

# Weight per one-letter amino acid code (covers O and U as well as the
# letters of protein_letters).
protein_weights = {
    "A": 89.0932, "C": 121.1582, "D": 133.1027, "E": 147.1293,
    "F": 165.1891, "G": 75.0666, "H": 155.1546, "I": 131.1729,
    "K": 146.1876, "L": 131.1729, "M": 149.2113, "N": 132.1179,
    "O": 255.3134, "P": 115.1305, "Q": 146.1445, "R": 174.201,
    "S": 105.0926, "T": 119.1192, "U": 168.0532, "V": 117.1463,
    "W": 204.2252, "Y": 181.1885,
}

# Monoisotopic counterpart of protein_weights (same keys).
monoisotopic_protein_weights = {
    "A": 89.047678, "C": 121.019749, "D": 133.037508, "E": 147.053158,
    "F": 165.078979, "G": 75.032028, "H": 155.069477, "I": 131.094629,
    "K": 146.105528, "L": 131.094629, "M": 149.051049, "N": 132.053492,
    "O": 255.158292, "P": 115.063329, "Q": 146.069142, "R": 174.111676,
    "S": 105.042593, "T": 119.058243, "U": 168.964203, "V": 117.078979,
    "W": 204.089878, "Y": 181.073893,
}

# Each extended protein letter mapped to the residues it may stand for.
extended_protein_values = {
    "A": "A", "B": "ND", "C": "C", "D": "D", "E": "E",
    "F": "F", "G": "G", "H": "H", "I": "I", "J": "IL",
    "K": "K", "L": "L", "M": "M", "N": "N", "O": "O",
    "P": "P", "Q": "Q", "R": "R", "S": "S", "T": "T",
    "U": "U", "V": "V", "W": "W",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "X": "ACDEFGHIKLMNPQRSTVWY",
    "Y": "Y", "Z": "QE",
}

# Each protein weight w expressed as the degenerate range (w, w).
protein_weight_ranges = _make_ranges(protein_weights)

# (min, max) weight range and average weight for every extended letter.
extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
    extended_protein_values, protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Entries keep their original order; rows are grouped by period only
# for readability.
atom_weights = {
    # period 1
    'H': 1.00794, 'He': 4.002602,
    # period 2
    'Li': 6.941, 'Be': 9.012182, 'B': 10.811, 'C': 12.0107,
    'N': 14.0067, 'O': 15.9994, 'F': 18.9984032, 'Ne': 20.1797,
    # period 3
    'Na': 22.989770, 'Mg': 24.3050, 'Al': 26.981538, 'Si': 28.0855,
    'P': 30.973761, 'S': 32.065, 'Cl': 35.453, 'Ar': 39.948,
    # period 4
    'K': 39.0983, 'Ca': 40.078, 'Sc': 44.955910, 'Ti': 47.867,
    'V': 50.9415, 'Cr': 51.9961, 'Mn': 54.938049, 'Fe': 55.845,
    'Co': 58.933200, 'Ni': 58.6934, 'Cu': 63.546, 'Zn': 65.39,
    'Ga': 69.723, 'Ge': 72.64, 'As': 74.92160, 'Se': 78.96,
    'Br': 79.904, 'Kr': 83.80,
    # period 5
    'Rb': 85.4678, 'Sr': 87.62, 'Y': 88.90585, 'Zr': 91.224,
    'Nb': 92.90638, 'Mo': 95.94, 'Tc': 98.0, 'Ru': 101.07,
    'Rh': 102.90550, 'Pd': 106.42, 'Ag': 107.8682, 'Cd': 112.411,
    'In': 114.818, 'Sn': 118.710, 'Sb': 121.760, 'Te': 127.60,
    'I': 126.90447, 'Xe': 131.293,
    # period 6
    'Cs': 132.90545, 'Ba': 137.327, 'La': 138.9055, 'Ce': 140.116,
    'Pr': 140.90765, 'Nd': 144.24, 'Pm': 145.0, 'Sm': 150.36,
    'Eu': 151.964, 'Gd': 157.25, 'Tb': 158.92534, 'Dy': 162.50,
    'Ho': 164.93032, 'Er': 167.259, 'Tm': 168.93421, 'Yb': 173.04,
    'Lu': 174.967, 'Hf': 178.49, 'Ta': 180.9479, 'W': 183.84,
    'Re': 186.207, 'Os': 190.23, 'Ir': 192.217, 'Pt': 195.078,
    'Au': 196.96655, 'Hg': 200.59, 'Tl': 204.3833, 'Pb': 207.2,
    'Bi': 208.98038, 'Po': 208.98, 'At': 209.99, 'Rn': 222.02,
    # period 7
    'Fr': 223.02, 'Ra': 226.03, 'Ac': 227.03, 'Th': 232.0381,
    'Pa': 231.03588, 'U': 238.02891, 'Np': 237.05, 'Pu': 244.06,
    'Am': 243.06, 'Cm': 247.07, 'Bk': 247.07, 'Cf': 251.08,
    'Es': 252.08, 'Fm': 257.10, 'Md': 258.10, 'No': 259.10,
    'Lr': 262.11, 'Rf': 261.11, 'Db': 262.11, 'Sg': 266.12,
    'Bh': 264.12, 'Hs': 269.13, 'Mt': 268.14,
}