Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. 
 10   
 11  The DSSP codes for secondary structure used here are: 
 12   
 13      H 
 14          Alpha helix (4-12) 
 15      B 
 16          Isolated beta-bridge residue 
 17      E 
 18          Strand 
 19      G 
 20          3-10 helix 
 21      I 
 22          pi helix 
 23      T 
 24          Turn 
 25      S 
 26          Bend 
 27      \- 
 28          None 
 29   
 30  The following Accessible surface area (ASA) values can be used, defaulting 
 31  to the Sander and Rost values: 
 32   
 33      Miller 
 34          Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
 35      Sander 
 36          Sander and Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
 37      Wilke 
 38          Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
 39   
 40  """ 
 41   
 42  from __future__ import print_function 
 43   
 44  import re 
 45  from Bio._py3k import StringIO 
 46  import subprocess 
 47  import warnings 
 48   
 49  from Bio.Data import SCOPData 
 50   
 51  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 52  from Bio.PDB.PDBExceptions import PDBException 
 53  from Bio.PDB.PDBParser import PDBParser 
 54   
 55   
 56  # Match C in DSSP 
 57  _dssp_cys = re.compile('[a-z]') 
 58   
 59  # Maximal ASA of amino acids 
 60  # Used for relative accessibility 
 61   
 62  residue_max_acc = { 
 63      # Miller max acc: Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
 64      # Wilke: Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
 65      # Sander: Sander & Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
 66      'Miller': { 
 67          'ALA': 113.0, 'ARG': 241.0, 'ASN': 158.0, 'ASP': 151.0, 
 68          'CYS': 140.0, 'GLN': 189.0, 'GLU': 183.0, 'GLY': 85.0, 
 69          'HIS': 194.0, 'ILE': 182.0, 'LEU': 180.0, 'LYS': 211.0, 
 70          'MET': 204.0, 'PHE': 218.0, 'PRO': 143.0, 'SER': 122.0, 
 71          'THR': 146.0, 'TRP': 259.0, 'TYR': 229.0, 'VAL': 160.0 
 72      }, 
 73      'Wilke': { 
 74          'ALA': 129.0, 'ARG': 274.0, 'ASN': 195.0, 'ASP': 193.0, 
 75          'CYS': 167.0, 'GLN': 225.0, 'GLU': 223.0, 'GLY': 104.0, 
 76          'HIS': 224.0, 'ILE': 197.0, 'LEU': 201.0, 'LYS': 236.0, 
 77          'MET': 224.0, 'PHE': 240.0, 'PRO': 159.0, 'SER': 155.0, 
 78          'THR': 172.0, 'TRP': 285.0, 'TYR': 263.0, 'VAL': 174.0 
 79      }, 
 80      'Sander': { 
 81          'ALA': 106.0, 'ARG': 248.0, 'ASN': 157.0, 'ASP': 163.0, 
 82          'CYS': 135.0, 'GLN': 198.0, 'GLU': 194.0, 'GLY': 84.0, 
 83          'HIS': 184.0, 'ILE': 169.0, 'LEU': 164.0, 'LYS': 205.0, 
 84          'MET': 188.0, 'PHE': 197.0, 'PRO': 136.0, 'SER': 130.0, 
 85          'THR': 142.0, 'TRP': 227.0, 'TYR': 222.0, 'VAL': 142.0 
 86      } 
 87  } 
 88   
 89   
90 -def ss_to_index(ss):
91 """Secondary structure symbol to index. 92 93 H=0 94 E=1 95 C=2 96 """ 97 if ss == 'H': 98 return 0 99 if ss == 'E': 100 return 1 101 if ss == 'C': 102 return 2 103 assert 0
104 105
106 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
107 """Create a DSSP dictionary from a PDB file. 108 109 Example: 110 -------- 111 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 112 >>> aa, ss, acc=dssp_dict[('A', 1)] 113 114 Parameters 115 ---------- 116 in_file : string 117 pdb file 118 119 DSSP : string 120 DSSP executable (argument to os.system) 121 122 Returns 123 ------- 124 (out_dict, keys) : tuple 125 a dictionary that maps (chainid, resid) to 126 amino acid type, secondary structure code and 127 accessibility. 128 """ 129 # Using universal newlines is important on Python 3, this 130 # gives unicode handles rather than bytes handles. 131 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 132 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 133 out, err = p.communicate() 134 135 # Alert user for errors 136 if err.strip(): 137 warnings.warn(err) 138 if not out.strip(): 139 raise Exception('DSSP failed to produce an output') 140 141 out_dict, keys = _make_dssp_dict(StringIO(out)) 142 return out_dict, keys
143 144
145 -def make_dssp_dict(filename):
146 """DSSP dictionary mapping identifers to properties. 147 148 Return a DSSP dictionary that maps (chainid, resid) to 149 aa, ss and accessibility, from a DSSP file. :: 150 151 Parameters 152 ---------- 153 filename : string 154 the DSSP output file 155 """ 156 with open(filename, "r") as handle: 157 return _make_dssp_dict(handle)
158 159
160 -def _make_dssp_dict(handle):
161 """Internal function used by mask_dssp_dict (PRIVATE). 162 163 Return a DSSP dictionary that maps (chainid, resid) to an amino acid, 164 secondary structure symbol, solvent accessibility value, and hydrogen bond 165 information (relative dssp indices and hydrogen bond energies) from an open 166 DSSP file object. :: 167 168 Parameters 169 ---------- 170 handle : file 171 the open DSSP output file handle 172 """ 173 dssp = {} 174 start = 0 175 keys = [] 176 for l in handle.readlines(): 177 sl = l.split() 178 if len(sl) < 2: 179 continue 180 if sl[1] == "RESIDUE": 181 # Start parsing from here 182 start = 1 183 continue 184 if not start: 185 continue 186 if l[9] == " ": 187 # Skip -- missing residue 188 continue 189 190 dssp_index = int(l[:5]) 191 resseq = int(l[5:10]) 192 icode = l[10] 193 chainid = l[11] 194 aa = l[13] 195 ss = l[16] 196 if ss == " ": 197 ss = "-" 198 try: 199 NH_O_1_relidx = int(l[38:45]) 200 NH_O_1_energy = float(l[46:50]) 201 O_NH_1_relidx = int(l[50:56]) 202 O_NH_1_energy = float(l[57:61]) 203 NH_O_2_relidx = int(l[61:67]) 204 NH_O_2_energy = float(l[68:72]) 205 O_NH_2_relidx = int(l[72:78]) 206 O_NH_2_energy = float(l[79:83]) 207 208 acc = int(l[34:38]) 209 phi = float(l[103:109]) 210 psi = float(l[109:115]) 211 except ValueError as exc: 212 # DSSP output breaks its own format when there are >9999 213 # residues, since only 4 digits are allocated to the seq num 214 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 215 # Here, look for whitespace to figure out the number of extra 216 # digits, and shift parsing the rest of the line by that amount. 217 if l[34] != ' ': 218 shift = l[34:].find(' ') 219 220 NH_O_1_relidx = int(l[38 + shift:45 + shift]) 221 NH_O_1_energy = float(l[46 + shift:50 + shift]) 222 O_NH_1_relidx = int(l[50 + shift:56 + shift]) 223 O_NH_1_energy = float(l[57 + shift:61 + shift]) 224 NH_O_2_relidx = int(l[61 + shift:67 + shift]) 225 NH_O_2_energy = float(l[68 + shift:72 + shift]) 226 O_NH_2_relidx = int(l[72 + shift:78 + shift]) 227 O_NH_2_energy = float(l[79 + shift:83 + shift]) 228 229 acc = int((l[34 + shift:38 + shift])) 230 phi = float(l[103 + shift:109 + shift]) 231 psi = float(l[109 + shift:115 + shift]) 232 else: 233 raise ValueError(exc) 234 res_id = (" ", resseq, icode) 235 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi, dssp_index, 236 NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 237 NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 238 keys.append((chainid, res_id)) 239 return dssp, keys
240 241
242 -class DSSP(AbstractResiduePropertyMap):
243 """Run DSSP and parse secondary structure and accessibility. 244 245 Run DSSP on a pdb file, and provide a handle to the 246 DSSP secondary structure and accessibility. 247 248 **Note** that DSSP can only handle one model. 249 250 Example: 251 -------- 252 253 >>> p = PDBParser() 254 >>> structure = p.get_structure("1MOT", "1MOT.pdb") 255 >>> model = structure[0] 256 >>> dssp = DSSP(model, "1MOT.pdb") 257 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 258 >>> a_key = list(dssp.keys())[2] 259 >>> # residue object, secondary structure, solvent accessibility, 260 >>> # relative accessiblity, phi, psi 261 >>> dssp[a_key] 262 (<Residue ALA het= resseq=251 icode= >, 263 'H', 264 72, 265 0.67924528301886788, 266 -61.200000000000003, 267 -42.399999999999999) 268 """ 269
270 - def __init__(self, model, in_file, dssp="dssp", acc_array="Sander", file_type='PDB'):
271 """Create a DSSP object. 272 273 Parameters 274 ---------- 275 model : Model 276 The first model of the structure 277 in_file : string 278 Either a PDB file or a DSSP file. 279 dssp : string 280 The dssp executable (ie. the argument to os.system) 281 acc_array : string 282 Accessible surface area (ASA) from either Miller et al. (1987), 283 Sander & Rost (1994), or Wilke: Tien et al. 2013, as string 284 Sander/Wilke/Miller. Defaults to Sander. 285 file_type: string 286 File type switch, either PDB or DSSP with PDB as default. 287 """ 288 289 self.residue_max_acc = residue_max_acc[acc_array] 290 291 # create DSSP dictionary 292 file_type = file_type.upper() 293 assert(file_type in ['PDB', 'DSSP']) 294 # If the input file is a PDB file run DSSP and parse output: 295 if file_type == 'PDB': 296 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp) 297 # If the input file is a DSSP file just parse it directly: 298 elif file_type == 'DSSP': 299 dssp_dict, dssp_keys = make_dssp_dict(in_file) 300 301 dssp_map = {} 302 dssp_list = [] 303 304 def resid2code(res_id): 305 """Serialize a residue's resseq and icode for easy comparison.""" 306 return '%s%s' % (res_id[1], res_id[2])
307 308 # Now create a dictionary that maps Residue objects to 309 # secondary structure and accessibility, and a list of 310 # (residue, (secondary structure, accessibility)) tuples 311 for key in dssp_keys: 312 chain_id, res_id = key 313 chain = model[chain_id] 314 try: 315 res = chain[res_id] 316 except KeyError: 317 # In DSSP, HET field is not considered in residue identifier. 318 # Thus HETATM records may cause unnecessary exceptions. 319 # (See 3jui chain A res 593.) 320 # Try the lookup again with all HETATM other than water 321 res_seq_icode = resid2code(res_id) 322 for r in chain: 323 if r.id[0] not in (' ', 'W'): 324 # Compare resseq + icode 325 if resid2code(r.id) == res_seq_icode: 326 # Found a matching residue 327 res = r 328 break 329 else: 330 raise KeyError(res_id) 331 332 # For disordered residues of point mutations, BioPython uses the 333 # last one as default, But DSSP takes the first one (alternative 334 # location is blank, A or 1). See 1h9h chain E resi 22. 335 # Here we select the res in which all atoms have altloc blank, A or 336 # 1. If no such residues are found, simply use the first one appears 337 # (as DSSP does). 338 if res.is_disordered() == 2: 339 for rk in res.disordered_get_id_list(): 340 # All atoms in the disordered residue should have the same 341 # altloc, so it suffices to check the altloc of the first 342 # atom. 343 altloc = res.child_dict[rk].get_list()[0].get_altloc() 344 if altloc in tuple('A1 '): 345 res.disordered_select(rk) 346 break 347 else: 348 # Simply select the first one 349 res.disordered_select(res.disordered_get_id_list()[0]) 350 351 # Sometimes point mutations are put into HETATM and ATOM with altloc 352 # 'A' and 'B'. 353 # See 3piu chain A residue 273: 354 # <Residue LLP het=H_LLP resseq=273 icode= > 355 # <Residue LYS het= resseq=273 icode= > 356 # DSSP uses the HETATM LLP as it has altloc 'A' 357 # We check the altloc code here. 358 elif res.is_disordered() == 1: 359 # Check altloc of all atoms in the DisorderedResidue. If it 360 # contains blank, A or 1, then use it. Otherwise, look for HET 361 # residues of the same seq+icode. If not such HET residues are 362 # found, just accept the current one. 363 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 364 if altlocs.isdisjoint('A1 '): 365 # Try again with all HETATM other than water 366 res_seq_icode = resid2code(res_id) 367 for r in chain: 368 if r.id[0] not in (' ', 'W'): 369 if resid2code(r.id) == res_seq_icode and \ 370 r.get_list()[0].get_altloc() in tuple('A1 '): 371 res = r 372 break 373 374 (aa, ss, acc, phi, psi, dssp_index, 375 NH_O_1_relidx, NH_O_1_energy, 376 O_NH_1_relidx, O_NH_1_energy, 377 NH_O_2_relidx, NH_O_2_energy, 378 O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key] 379 380 res.xtra["SS_DSSP"] = ss 381 res.xtra["EXP_DSSP_ASA"] = acc 382 res.xtra["PHI_DSSP"] = phi 383 res.xtra["PSI_DSSP"] = psi 384 res.xtra["DSSP_INDEX"] = dssp_index 385 res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx 386 res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy 387 res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx 388 res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy 389 res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx 390 res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy 391 res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx 392 res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy 393 394 # Relative accessibility 395 resname = res.get_resname() 396 try: 397 rel_acc = acc / self.residue_max_acc[resname] 398 except KeyError: 399 # Invalid value for resname 400 rel_acc = 'NA' 401 else: 402 if rel_acc > 1.0: 403 rel_acc = 1.0 404 res.xtra["EXP_DSSP_RASA"] = rel_acc 405 # Verify if AA in DSSP == AA in Structure 406 # Something went wrong if this is not true! 407 # NB: DSSP uses X often 408 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 409 if resname == "C": 410 # DSSP renames C in C-bridges to a,b,c,d,... 411 # - we rename it back to 'C' 412 if _dssp_cys.match(aa): 413 aa = 'C' 414 # Take care of HETATM again 415 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 416 raise PDBException("Structure/DSSP mismatch at %s" % res) 417 418 dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi, 419 NH_O_1_relidx, NH_O_1_energy, 420 O_NH_1_relidx, O_NH_1_energy, 421 NH_O_2_relidx, NH_O_2_energy, 422 O_NH_2_relidx, O_NH_2_energy) 423 424 dssp_map[key] = dssp_vals 425 dssp_list.append(dssp_vals) 426 427 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 428 dssp_list)
429 430 431 if __name__ == "__main__": 432 import sys 433 434 p = PDBParser() 435 s = p.get_structure('X', sys.argv[1]) 436 model = s[0] 437 d = DSSP(model, sys.argv[1]) 438 439 for r in d: 440 print(r) 441 print("Handled %i residues" % len(d)) 442 print(sorted(d)) 443 if ('A', 1) in d: 444 print(d[('A', 1)]) 445 print(s[0]['A'][1].xtra) 446 # Secondary structure 447 print(''.join(item[1] for item in d)) 448