Package Bio :: Package Data :: Module IUPACData
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.IUPACData

  1  # Information about the IUPAC alphabets 
  2   
  3  protein_letters = "ACDEFGHIKLMNPQRSTVWY" 
  4  extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO" 
  5  #   B = "Asx";  aspartic acid or asparagine (D or N) 
  6  #   X = "Xxx";  unknown or 'other' amino acid 
  7  #   Z = "Glx";  glutamic acid or glutamine (E or Q) 
  8  #   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212 
  9  # 
 10  #   J = "Xle";  leucine or isoleucine (L or I, used in NMR) 
 11  #   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 12  #   Also the International Nucleotide Sequence Database Collaboration (INSDC) 
 13  #   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006 
 14  #   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html 
 15  # 
 16  #   Xle (J); Leucine or Isoleucine 
 17  #   The residue abbreviations, Xle (the three-letter abbreviation) and J 
 18  #   (the one-letter abbreviation) are reserved for the case that cannot 
 19  #   experimentally distinguish leucine from isoleucine. 
 20  # 
 21  #   U = "Sec";  selenocysteine 
 22  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 
 23  # 
 24  #   O = "Pyl";  pyrrolysine 
 25  #   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35 
 26  ambiguous_dna_letters = "GATCRYWSMKHBVDN" 
 27  unambiguous_dna_letters = "GATC" 
 28  ambiguous_rna_letters = "GAUCRYWSMKHBVDN" 
 29  unambiguous_rna_letters = "GAUC" 
 30   
 31  #   B == 5-bromouridine 
 32  #   D == 5,6-dihydrouridine 
 33  #   S == thiouridine 
 34  #   W == wyosine 
 35  extended_dna_letters = "GATCBDSW" 
 36   
 37  # are there extended forms? 
 38  #extended_rna_letters = "GAUCBDSW" 
 39   
 40  ambiguous_dna_values = { 
 41      "A": "A", 
 42      "C": "C", 
 43      "G": "G", 
 44      "T": "T", 
 45      "M": "AC", 
 46      "R": "AG", 
 47      "W": "AT", 
 48      "S": "CG", 
 49      "Y": "CT", 
 50      "K": "GT", 
 51      "V": "ACG", 
 52      "H": "ACT", 
 53      "D": "AGT", 
 54      "B": "CGT", 
 55      "X": "GATC", 
 56      "N": "GATC", 
 57      } 
 58  ambiguous_rna_values = { 
 59      "A": "A", 
 60      "C": "C", 
 61      "G": "G", 
 62      "U": "U", 
 63      "M": "AC", 
 64      "R": "AG", 
 65      "W": "AU", 
 66      "S": "CG", 
 67      "Y": "CU", 
 68      "K": "GU", 
 69      "V": "ACG", 
 70      "H": "ACU", 
 71      "D": "AGU", 
 72      "B": "CGU", 
 73      "X": "GAUC", 
 74      "N": "GAUC", 
 75      } 
 76   
 77  ambiguous_dna_complement = { 
 78      "A": "T", 
 79      "C": "G", 
 80      "G": "C", 
 81      "T": "A", 
 82      "M": "K", 
 83      "R": "Y", 
 84      "W": "W", 
 85      "S": "S", 
 86      "Y": "R", 
 87      "K": "M", 
 88      "V": "B", 
 89      "H": "D", 
 90      "D": "H", 
 91      "B": "V", 
 92      "X": "X", 
 93      "N": "N", 
 94      } 
 95   
 96  ambiguous_rna_complement = { 
 97      "A": "U", 
 98      "C": "G", 
 99      "G": "C", 
100      "U": "A", 
101      "M": "K", 
102      "R": "Y", 
103      "W": "W", 
104      "S": "S", 
105      "Y": "R", 
106      "K": "M", 
107      "V": "B", 
108      "H": "D", 
109      "D": "H", 
110      "B": "V", 
111      "X": "X", 
112      "N": "N", 
113      } 
114   
115   
116 -def _make_ranges(mydict):
117 d = {} 118 for key, value in mydict.iteritems(): 119 d[key] = (value, value) 120 return d
# From bioperl's SeqStats.pm
unambiguous_dna_weights = {
    "A": 347.,
    "C": 323.,
    "G": 363.,
    "T": 322.,
}
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

unambiguous_rna_weights = {
    "A": unambiguous_dna_weights["A"] + 16.,  # 16 for the oxygen
    "C": unambiguous_dna_weights["C"] + 16.,
    "G": unambiguous_dna_weights["G"] + 16.,
    "U": 340.,
}
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
140 -def _make_ambiguous_ranges(mydict, weight_table):
141 range_d = {} 142 avg_d = {} 143 for letter, values in mydict.iteritems(): 144 #Following line is a quick hack to skip undefined weights for U and O 145 if len(values) == 1 and values[0] not in weight_table: 146 continue 147 148 weights = map(weight_table.get, values) 149 range_d[letter] = (min(weights), max(weights)) 150 total_w = 0.0 151 for w in weights: 152 total_w = total_w + w 153 avg_d[letter] = total_w / len(weights) 154 return range_d, avg_d
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = \
    _make_ambiguous_ranges(ambiguous_dna_values,
                           unambiguous_dna_weights)

ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = \
    _make_ambiguous_ranges(ambiguous_rna_values,
                           unambiguous_rna_weights)

protein_weights = {
    "A": 89.09,
    "C": 121.16,
    "D": 133.10,
    "E": 147.13,
    "F": 165.19,
    "G": 75.07,
    "H": 155.16,
    "I": 131.18,
    "K": 146.19,
    "L": 131.18,
    "M": 149.21,
    "N": 132.12,
    #"O": 0.0,  # Needs to be recorded!
    "P": 115.13,
    "Q": 146.15,
    "R": 174.20,
    "S": 105.09,
    "T": 119.12,
    #"U": 168.05,  # To be confirmed
    "V": 117.15,
    "W": 204.23,
    "Y": 181.19
}

monoisotopic_protein_weights = {
    "A": 89.05,
    "C": 121.02,
    "D": 133.04,
    "E": 147.05,
    "F": 165.08,
    "G": 75.03,
    "H": 155.07,
    "I": 131.09,
    "K": 146.11,
    "L": 131.09,
    "M": 149.05,
    "N": 132.05,
    "P": 115.06,
    "Q": 146.07,
    "R": 174.11,
    "S": 105.04,
    "T": 119.06,
    "V": 117.08,
    "W": 204.09,
    "Y": 181.07,
}

# Maps each extended protein letter to the string of letters it can stand
# for.
extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    #TODO - Include U and O in the possible values of X?
    #This could alter the extended_protein_weight_ranges ...
    "Y": "Y",
    "Z": "QE",
}

protein_weight_ranges = _make_ranges(protein_weights)

extended_protein_weight_ranges, avg_extended_protein_weights = \
    _make_ambiguous_ranges(extended_protein_values,
                           protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Maps element symbol to atomic weight.
atom_weights = {
    'H': 1.00794,
    'He': 4.002602,
    'Li': 6.941,
    'Be': 9.012182,
    'B': 10.811,
    'C': 12.0107,
    'N': 14.0067,
    'O': 15.9994,
    'F': 18.9984032,
    'Ne': 20.1797,
    'Na': 22.989770,
    'Mg': 24.3050,
    'Al': 26.981538,
    'Si': 28.0855,
    'P': 30.973761,
    'S': 32.065,
    'Cl': 35.453,
    'Ar': 39.948,
    'K': 39.0983,
    'Ca': 40.078,
    'Sc': 44.955910,
    'Ti': 47.867,
    'V': 50.9415,
    'Cr': 51.9961,
    'Mn': 54.938049,
    'Fe': 55.845,
    'Co': 58.933200,
    'Ni': 58.6934,
    'Cu': 63.546,
    'Zn': 65.39,
    'Ga': 69.723,
    'Ge': 72.64,
    'As': 74.92160,
    'Se': 78.96,
    'Br': 79.904,
    'Kr': 83.80,
    'Rb': 85.4678,
    'Sr': 87.62,
    'Y': 88.90585,
    'Zr': 91.224,
    'Nb': 92.90638,
    'Mo': 95.94,
    'Tc': 98.0,
    'Ru': 101.07,
    'Rh': 102.90550,
    'Pd': 106.42,
    'Ag': 107.8682,
    'Cd': 112.411,
    'In': 114.818,
    'Sn': 118.710,
    'Sb': 121.760,
    'Te': 127.60,
    'I': 126.90447,
    'Xe': 131.293,
    'Cs': 132.90545,
    'Ba': 137.327,
    'La': 138.9055,
    'Ce': 140.116,
    'Pr': 140.90765,
    'Nd': 144.24,
    'Pm': 145.0,
    'Sm': 150.36,
    'Eu': 151.964,
    'Gd': 157.25,
    'Tb': 158.92534,
    'Dy': 162.50,
    'Ho': 164.93032,
    'Er': 167.259,
    'Tm': 168.93421,
    'Yb': 173.04,
    'Lu': 174.967,
    'Hf': 178.49,
    'Ta': 180.9479,
    'W': 183.84,
    'Re': 186.207,
    'Os': 190.23,
    'Ir': 192.217,
    'Pt': 195.078,
    'Au': 196.96655,
    'Hg': 200.59,
    'Tl': 204.3833,
    'Pb': 207.2,
    'Bi': 208.98038,
    'Po': 208.98,
    'At': 209.99,
    'Rn': 222.02,
    'Fr': 223.02,
    'Ra': 226.03,
    'Ac': 227.03,
    'Th': 232.0381,
    'Pa': 231.03588,
    'U': 238.02891,
    'Np': 237.05,
    'Pu': 244.06,
    'Am': 243.06,
    'Cm': 247.07,
    'Bk': 247.07,
    'Cf': 251.08,
    'Es': 252.08,
    'Fm': 257.10,
    'Md': 258.10,
    'No': 259.10,
    'Lr': 262.11,
    'Rf': 261.11,
    'Db': 262.11,
    'Sg': 266.12,
    'Bh': 264.12,
    'Hs': 269.13,
    'Mt': 268.14,
}