Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  r"""Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see http://swift.cmbi.ru.nl/gv/dssp/. 
 10   
 11  The following Accessible surface area (ASA) values can be used, defaulting 
 12  to the Sander and Rost values: 
 13   
 14      Miller 
 15          Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
 16      Sander 
 17          Sander and Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
 18      Wilke 
 19          Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
 20   
 21  The DSSP codes for secondary structure used here are: 
 22   
 23      =====     ==== 
 24      Code      Structure 
 25      =====     ==== 
 26       H        Alpha helix (4-12) 
 27       B        Isolated beta-bridge residue 
 28       E        Strand 
 29       G        3-10 helix 
 30       I        Pi helix 
 31       T        Turn 
 32       S        Bend 
 33       \-       None 
 34      =====     ==== 
 35   
 36  Usage 
 37  ----- 
 38   
 39  The DSSP class can be used to run DSSP on a pdb file, and provides a 
 40  handle to the DSSP secondary structure and accessibility. 
 41   
 42  **Note** that DSSP can only handle one model, and will only run 
 43  calculations on the first model in the provided PDB file. 
 44   
 45  Example: 
 46  -------- 
 47   
 48  >>> p = PDBParser() 
 49  >>> structure = p.get_structure("1MOT", "1mot.pdb") 
 50  >>> model = structure[0] 
 51  >>> dssp = DSSP(model, "1mot.pdb") 
 52   
 53  Note that the recent DSSP executable from the DSSP-2 package was 
 54  renamed from `dssp` to `mkdssp`. If using a recent DSSP release, 
 55  you may need to provide the name of your DSSP executable: 
 56   
 57  >>> dssp = DSSP(model, '1mot.pdb', dssp='mkdssp') 
 58   
 59  DSSP data is accessed by a tuple - (chain id, residue id): 
 60   
 61  >>> a_key = list(dssp.keys())[2] 
 62  >>> a_key 
 63  ('A', (' ', 251, ' ')) 
 64  >>> dssp[a_key] 
 65  (3, 'A', 'H', 0.7075471698113207, -61.2, -42.4, 
 66   -2, -0.7, 4, -3.0, 1, -0.2, 5, -0.2) 
 67   
 68  The dssp data returned for a single residue is a tuple in the form: 
 69   
 70      ============ === 
 71      Tuple Index  Value 
 72      ============ === 
 73      0            DSSP index 
 74      1            Amino acid 
 75      2            Secondary structure 
 76      3            Relative ASA 
 77      4            Phi 
 78      5            Psi 
 79      6            NH-->O_1_relidx 
 80      7            NH-->O_1_energy 
 81      8            O-->NH_1_relidx 
 82      9            O-->NH_1_energy 
 83      10           NH-->O_2_relidx 
 84      11           NH-->O_2_energy 
 85      12           O-->NH_2_relidx 
 86      13           O-->NH_2_energy 
 87      ============ === 
 88   
 89  """ 
 90   
 91  from __future__ import print_function 
 92   
 93  import re 
 94  from Bio._py3k import StringIO 
 95  import subprocess 
 96  import warnings 
 97   
 98  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 99  from Bio.PDB.PDBExceptions import PDBException 
100  from Bio.PDB.PDBParser import PDBParser 
101  from Bio.PDB.Polypeptide import three_to_one 
102   
103  # Match C in DSSP 
104  _dssp_cys = re.compile('[a-z]') 
105   
106  # Maximal ASA of amino acids 
107  # Used for relative accessibility 
108   
109  residue_max_acc = { 
110      # Miller max acc: Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
111      # Wilke: Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
112      # Sander: Sander & Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
113      'Miller': { 
114          'ALA': 113.0, 'ARG': 241.0, 'ASN': 158.0, 'ASP': 151.0, 
115          'CYS': 140.0, 'GLN': 189.0, 'GLU': 183.0, 'GLY': 85.0, 
116          'HIS': 194.0, 'ILE': 182.0, 'LEU': 180.0, 'LYS': 211.0, 
117          'MET': 204.0, 'PHE': 218.0, 'PRO': 143.0, 'SER': 122.0, 
118          'THR': 146.0, 'TRP': 259.0, 'TYR': 229.0, 'VAL': 160.0 
119      }, 
120      'Wilke': { 
121          'ALA': 129.0, 'ARG': 274.0, 'ASN': 195.0, 'ASP': 193.0, 
122          'CYS': 167.0, 'GLN': 225.0, 'GLU': 223.0, 'GLY': 104.0, 
123          'HIS': 224.0, 'ILE': 197.0, 'LEU': 201.0, 'LYS': 236.0, 
124          'MET': 224.0, 'PHE': 240.0, 'PRO': 159.0, 'SER': 155.0, 
125          'THR': 172.0, 'TRP': 285.0, 'TYR': 263.0, 'VAL': 174.0 
126      }, 
127      'Sander': { 
128          'ALA': 106.0, 'ARG': 248.0, 'ASN': 157.0, 'ASP': 163.0, 
129          'CYS': 135.0, 'GLN': 198.0, 'GLU': 194.0, 'GLY': 84.0, 
130          'HIS': 184.0, 'ILE': 169.0, 'LEU': 164.0, 'LYS': 205.0, 
131          'MET': 188.0, 'PHE': 197.0, 'PRO': 136.0, 'SER': 130.0, 
132          'THR': 142.0, 'TRP': 227.0, 'TYR': 222.0, 'VAL': 142.0 
133      } 
134  } 
135   
136   
137 -def ss_to_index(ss):
138 """Secondary structure symbol to index. 139 140 H=0 141 E=1 142 C=2 143 """ 144 if ss == 'H': 145 return 0 146 if ss == 'E': 147 return 1 148 if ss == 'C': 149 return 2 150 assert 0
151 152
153 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
154 """Create a DSSP dictionary from a PDB file. 155 156 Example: 157 -------- 158 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 159 >>> aa, ss, acc=dssp_dict[('A', 1)] 160 161 Parameters 162 ---------- 163 in_file : string 164 pdb file 165 166 DSSP : string 167 DSSP executable (argument to os.system) 168 169 Returns 170 ------- 171 (out_dict, keys) : tuple 172 a dictionary that maps (chainid, resid) to 173 amino acid type, secondary structure code and 174 accessibility. 175 176 """ 177 # Using universal newlines is important on Python 3, this 178 # gives unicode handles rather than bytes handles. 179 # Newer version of DSSP executable is named 'mkdssp', 180 # and calling 'dssp' will hence not work in some operating systems 181 # (Debian distribution of DSSP includes a symlink for 'dssp' argument) 182 try: 183 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 184 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 185 except OSError: # TODO: Use FileNotFoundError once drop Python 2 186 if DSSP == "mkdssp": 187 raise 188 p = subprocess.Popen(["mkdssp", in_file], universal_newlines=True, 189 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 190 out, err = p.communicate() 191 192 # Alert user for errors 193 if err.strip(): 194 warnings.warn(err) 195 if not out.strip(): 196 raise Exception('DSSP failed to produce an output') 197 198 out_dict, keys = _make_dssp_dict(StringIO(out)) 199 return out_dict, keys
200 201
202 -def make_dssp_dict(filename):
203 """DSSP dictionary mapping identifers to properties. 204 205 Return a DSSP dictionary that maps (chainid, resid) to 206 aa, ss and accessibility, from a DSSP file. 207 208 Parameters 209 ---------- 210 filename : string 211 the DSSP output file 212 213 """ 214 with open(filename, "r") as handle: 215 return _make_dssp_dict(handle)
216 217
218 -def _make_dssp_dict(handle):
219 """Internal function used by mask_dssp_dict (PRIVATE). 220 221 Return a DSSP dictionary that maps (chainid, resid) to an amino acid, 222 secondary structure symbol, solvent accessibility value, and hydrogen bond 223 information (relative dssp indices and hydrogen bond energies) from an open 224 DSSP file object. 225 226 Parameters 227 ---------- 228 handle : file 229 the open DSSP output file handle 230 231 """ 232 dssp = {} 233 start = 0 234 keys = [] 235 for l in handle.readlines(): 236 sl = l.split() 237 if len(sl) < 2: 238 continue 239 if sl[1] == "RESIDUE": 240 # Start parsing from here 241 start = 1 242 continue 243 if not start: 244 continue 245 if l[9] == " ": 246 # Skip -- missing residue 247 continue 248 249 dssp_index = int(l[:5]) 250 resseq = int(l[5:10]) 251 icode = l[10] 252 chainid = l[11] 253 aa = l[13] 254 ss = l[16] 255 if ss == " ": 256 ss = "-" 257 try: 258 NH_O_1_relidx = int(l[38:45]) 259 NH_O_1_energy = float(l[46:50]) 260 O_NH_1_relidx = int(l[50:56]) 261 O_NH_1_energy = float(l[57:61]) 262 NH_O_2_relidx = int(l[61:67]) 263 NH_O_2_energy = float(l[68:72]) 264 O_NH_2_relidx = int(l[72:78]) 265 O_NH_2_energy = float(l[79:83]) 266 267 acc = int(l[34:38]) 268 phi = float(l[103:109]) 269 psi = float(l[109:115]) 270 except ValueError as exc: 271 # DSSP output breaks its own format when there are >9999 272 # residues, since only 4 digits are allocated to the seq num 273 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 274 # Here, look for whitespace to figure out the number of extra 275 # digits, and shift parsing the rest of the line by that amount. 276 if l[34] != ' ': 277 shift = l[34:].find(' ') 278 279 NH_O_1_relidx = int(l[38 + shift:45 + shift]) 280 NH_O_1_energy = float(l[46 + shift:50 + shift]) 281 O_NH_1_relidx = int(l[50 + shift:56 + shift]) 282 O_NH_1_energy = float(l[57 + shift:61 + shift]) 283 NH_O_2_relidx = int(l[61 + shift:67 + shift]) 284 NH_O_2_energy = float(l[68 + shift:72 + shift]) 285 O_NH_2_relidx = int(l[72 + shift:78 + shift]) 286 O_NH_2_energy = float(l[79 + shift:83 + shift]) 287 288 acc = int((l[34 + shift:38 + shift])) 289 phi = float(l[103 + shift:109 + shift]) 290 psi = float(l[109 + shift:115 + shift]) 291 else: 292 raise ValueError(exc) 293 res_id = (" ", resseq, icode) 294 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi, dssp_index, 295 NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 296 NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 297 keys.append((chainid, res_id)) 298 return dssp, keys
299 300
301 -class DSSP(AbstractResiduePropertyMap):
302 """Run DSSP and parse secondary structure and accessibility. 303 304 Run DSSP on a pdb file, and provide a handle to the 305 DSSP secondary structure and accessibility. 306 307 **Note** that DSSP can only handle one model. 308 309 Example: 310 -------- 311 312 >>> p = PDBParser() 313 >>> structure = p.get_structure("1MOT", "1mot.pdb") 314 >>> model = structure[0] 315 >>> dssp = DSSP(model, "1mot.pdb") 316 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 317 >>> a_key = list(dssp.keys())[2] 318 >>> # (dssp index, amino acid, secondary structure, relative ASA, phi, psi, 319 >>> # NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 320 >>> # NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 321 >>> dssp[a_key] 322 (3, 'A', 'H', 0.7075471698113207, -61.2, -42.4, 323 -2, -0.7, 4, -3.0, 1, -0.2, 5, -0.2) 324 """ 325
326 - def __init__(self, model, in_file, dssp="dssp", acc_array="Sander", file_type='PDB'):
327 """Create a DSSP object. 328 329 Parameters 330 ---------- 331 model : Model 332 The first model of the structure 333 in_file : string 334 Either a PDB file or a DSSP file. 335 dssp : string 336 The dssp executable (ie. the argument to os.system) 337 acc_array : string 338 Accessible surface area (ASA) from either Miller et al. (1987), 339 Sander & Rost (1994), or Wilke: Tien et al. 2013, as string 340 Sander/Wilke/Miller. Defaults to Sander. 341 file_type: string 342 File type switch, either PDB or DSSP with PDB as default. 343 344 """ 345 self.residue_max_acc = residue_max_acc[acc_array] 346 347 # create DSSP dictionary 348 file_type = file_type.upper() 349 assert(file_type in ['PDB', 'DSSP']) 350 # If the input file is a PDB file run DSSP and parse output: 351 if file_type == 'PDB': 352 # Newer versions of DSSP program call the binary 'mkdssp', so 353 # calling 'dssp' will not work in some operating systems 354 # (Debian distribution of DSSP includes a symlink for 'dssp' argument) 355 try: 356 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp) 357 except OSError: # TODO: Use FileNotFoundError once drop Python 2 358 if dssp == 'dssp': 359 dssp = 'mkdssp' 360 elif dssp == 'mkdssp': 361 dssp = 'dssp' 362 else: 363 raise 364 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp) 365 # If the input file is a DSSP file just parse it directly: 366 elif file_type == 'DSSP': 367 dssp_dict, dssp_keys = make_dssp_dict(in_file) 368 369 dssp_map = {} 370 dssp_list = [] 371 372 def resid2code(res_id): 373 """Serialize a residue's resseq and icode for easy comparison.""" 374 return '%s%s' % (res_id[1], res_id[2])
375 376 # Now create a dictionary that maps Residue objects to 377 # secondary structure and accessibility, and a list of 378 # (residue, (secondary structure, accessibility)) tuples 379 for key in dssp_keys: 380 chain_id, res_id = key 381 chain = model[chain_id] 382 try: 383 res = chain[res_id] 384 except KeyError: 385 # In DSSP, HET field is not considered in residue identifier. 386 # Thus HETATM records may cause unnecessary exceptions. 387 # (See 3jui chain A res 593.) 388 # Try the lookup again with all HETATM other than water 389 res_seq_icode = resid2code(res_id) 390 for r in chain: 391 if r.id[0] not in (' ', 'W'): 392 # Compare resseq + icode 393 if resid2code(r.id) == res_seq_icode: 394 # Found a matching residue 395 res = r 396 break 397 else: 398 raise KeyError(res_id) 399 400 # For disordered residues of point mutations, BioPython uses the 401 # last one as default, But DSSP takes the first one (alternative 402 # location is blank, A or 1). See 1h9h chain E resi 22. 403 # Here we select the res in which all atoms have altloc blank, A or 404 # 1. If no such residues are found, simply use the first one appears 405 # (as DSSP does). 406 if res.is_disordered() == 2: 407 for rk in res.disordered_get_id_list(): 408 # All atoms in the disordered residue should have the same 409 # altloc, so it suffices to check the altloc of the first 410 # atom. 411 altloc = res.child_dict[rk].get_list()[0].get_altloc() 412 if altloc in tuple('A1 '): 413 res.disordered_select(rk) 414 break 415 else: 416 # Simply select the first one 417 res.disordered_select(res.disordered_get_id_list()[0]) 418 419 # Sometimes point mutations are put into HETATM and ATOM with altloc 420 # 'A' and 'B'. 421 # See 3piu chain A residue 273: 422 # <Residue LLP het=H_LLP resseq=273 icode= > 423 # <Residue LYS het= resseq=273 icode= > 424 # DSSP uses the HETATM LLP as it has altloc 'A' 425 # We check the altloc code here. 426 elif res.is_disordered() == 1: 427 # Check altloc of all atoms in the DisorderedResidue. If it 428 # contains blank, A or 1, then use it. Otherwise, look for HET 429 # residues of the same seq+icode. If not such HET residues are 430 # found, just accept the current one. 431 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 432 if altlocs.isdisjoint('A1 '): 433 # Try again with all HETATM other than water 434 res_seq_icode = resid2code(res_id) 435 for r in chain: 436 if r.id[0] not in (' ', 'W'): 437 if resid2code(r.id) == res_seq_icode and \ 438 r.get_list()[0].get_altloc() in tuple('A1 '): 439 res = r 440 break 441 442 (aa, ss, acc, phi, psi, dssp_index, 443 NH_O_1_relidx, NH_O_1_energy, 444 O_NH_1_relidx, O_NH_1_energy, 445 NH_O_2_relidx, NH_O_2_energy, 446 O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key] 447 448 res.xtra["SS_DSSP"] = ss 449 res.xtra["EXP_DSSP_ASA"] = acc 450 res.xtra["PHI_DSSP"] = phi 451 res.xtra["PSI_DSSP"] = psi 452 res.xtra["DSSP_INDEX"] = dssp_index 453 res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx 454 res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy 455 res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx 456 res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy 457 res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx 458 res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy 459 res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx 460 res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy 461 462 # Relative accessibility 463 resname = res.get_resname() 464 try: 465 rel_acc = acc / self.residue_max_acc[resname] 466 except KeyError: 467 # Invalid value for resname 468 rel_acc = 'NA' 469 else: 470 if rel_acc > 1.0: 471 rel_acc = 1.0 472 res.xtra["EXP_DSSP_RASA"] = rel_acc 473 # Verify if AA in DSSP == AA in Structure 474 # Something went wrong if this is not true! 475 # NB: DSSP uses X often 476 try: 477 resname = three_to_one(resname) 478 except KeyError: 479 resname = 'X' 480 if resname == "C": 481 # DSSP renames C in C-bridges to a,b,c,d,... 482 # - we rename it back to 'C' 483 if _dssp_cys.match(aa): 484 aa = 'C' 485 # Take care of HETATM again 486 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 487 raise PDBException("Structure/DSSP mismatch at %s" % res) 488 489 dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi, 490 NH_O_1_relidx, NH_O_1_energy, 491 O_NH_1_relidx, O_NH_1_energy, 492 NH_O_2_relidx, NH_O_2_energy, 493 O_NH_2_relidx, O_NH_2_energy) 494 495 dssp_map[key] = dssp_vals 496 dssp_list.append(dssp_vals) 497 498 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 499 dssp_list)
500 501 502 if __name__ == "__main__": 503 import sys 504 505 p = PDBParser() 506 s = p.get_structure('X', sys.argv[1]) 507 model = s[0] 508 d = DSSP(model, sys.argv[1]) 509 510 for r in d: 511 print(r) 512 print("Handled %i residues" % len(d)) 513 print(sorted(d)) 514 if ('A', 1) in d: 515 print(d[('A', 1)]) 516 print(s[0]['A'][1].xtra) 517 # Secondary structure 518 print(''.join(item[1] for item in d)) 519