Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  r"""Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see http://swift.cmbi.ru.nl/gv/dssp/. 
 10   
 11  The following Accessible surface area (ASA) values can be used, defaulting 
 12  to the Sander and Rost values: 
 13   
 14      Miller 
 15          Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
 16      Sander 
 17          Sander and Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
 18      Wilke 
 19          Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
 20   
 21  The DSSP codes for secondary structure used here are: 
 22   
 23      =====     ==== 
 24      Code      Structure 
 25      =====     ==== 
 26       H        Alpha helix (4-12) 
 27       B        Isolated beta-bridge residue 
 28       E        Strand 
 29       G        3-10 helix 
 30       I        Pi helix 
 31       T        Turn 
 32       S        Bend 
 33       \-       None 
 34      =====     ==== 
 35   
 36  Usage 
 37  ----- 
 38   
 39  The DSSP class can be used to run DSSP on a pdb file, and provides a 
 40  handle to the DSSP secondary structure and accessibility. 
 41   
 42  **Note** that DSSP can only handle one model, and will only run 
 43  calculations on the first model in the provided PDB file. 
 44   
 45  Example: 
 46  -------- 
 47   
 48  >>> p = PDBParser() 
 49  >>> structure = p.get_structure("1MOT", "1mot.pdb") 
 50  >>> model = structure[0] 
 51  >>> dssp = DSSP(model, "1mot.pdb") 
 52   
 53  Note that the recent DSSP executable from the DSSP-2 package was 
 54  renamed from `dssp` to `mkdssp`. If using a recent DSSP release, 
 55  you may need to provide the name of your DSSP executable: 
 56   
 57  >>> dssp = DSSP(model, '1mot.pdb', dssp='mkdssp') 
 58   
 59  DSSP data is accessed by a tuple - (chain id, residue id): 
 60   
 61  >>> a_key = list(dssp.keys())[2] 
 62  >>> a_key 
 63  ('A', (' ', 251, ' ')) 
 64  >>> dssp[a_key] 
 65  (3, 'A', 'H', 0.7075471698113207, -61.2, -42.4, 
 66   -2, -0.7, 4, -3.0, 1, -0.2, 5, -0.2) 
 67   
 68  The dssp data returned for a single residue is a tuple in the form: 
 69   
 70      ============ === 
 71      Tuple Index  Value 
 72      ============ === 
 73      0            DSSP index 
 74      1            Amino acid 
 75      2            Secondary structure 
 76      3            Relative ASA 
 77      4            Phi 
 78      5            Psi 
 79      6            NH-->O_1_relidx 
 80      7            NH-->O_1_energy 
 81      8            O-->NH_1_relidx 
 82      9            O-->NH_1_energy 
 83      10           NH-->O_2_relidx 
 84      11           NH-->O_2_energy 
 85      12           O-->NH_2_relidx 
 86      13           O-->NH_2_energy 
 87      ============ === 
 88   
 89  """ 
 90   
 91  from __future__ import print_function 
 92   
 93  import re 
 94  from Bio._py3k import StringIO 
 95  import subprocess 
 96  import warnings 
 97   
 98  from Bio.Data import SCOPData 
 99   
100  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
101  from Bio.PDB.PDBExceptions import PDBException 
102  from Bio.PDB.PDBParser import PDBParser 
103   
104   
105  # Match C in DSSP 
106  _dssp_cys = re.compile('[a-z]') 
107   
108  # Maximal ASA of amino acids 
109  # Used for relative accessibility 
110   
111  residue_max_acc = { 
112      # Miller max acc: Miller et al. 1987 http://dx.doi.org/10.1016/0022-2836(87)90038-6 
113      # Wilke: Tien et al. 2013 http://dx.doi.org/10.1371/journal.pone.0080635 
114      # Sander: Sander & Rost 1994 http://dx.doi.org/10.1002/prot.340200303 
115      'Miller': { 
116          'ALA': 113.0, 'ARG': 241.0, 'ASN': 158.0, 'ASP': 151.0, 
117          'CYS': 140.0, 'GLN': 189.0, 'GLU': 183.0, 'GLY': 85.0, 
118          'HIS': 194.0, 'ILE': 182.0, 'LEU': 180.0, 'LYS': 211.0, 
119          'MET': 204.0, 'PHE': 218.0, 'PRO': 143.0, 'SER': 122.0, 
120          'THR': 146.0, 'TRP': 259.0, 'TYR': 229.0, 'VAL': 160.0 
121      }, 
122      'Wilke': { 
123          'ALA': 129.0, 'ARG': 274.0, 'ASN': 195.0, 'ASP': 193.0, 
124          'CYS': 167.0, 'GLN': 225.0, 'GLU': 223.0, 'GLY': 104.0, 
125          'HIS': 224.0, 'ILE': 197.0, 'LEU': 201.0, 'LYS': 236.0, 
126          'MET': 224.0, 'PHE': 240.0, 'PRO': 159.0, 'SER': 155.0, 
127          'THR': 172.0, 'TRP': 285.0, 'TYR': 263.0, 'VAL': 174.0 
128      }, 
129      'Sander': { 
130          'ALA': 106.0, 'ARG': 248.0, 'ASN': 157.0, 'ASP': 163.0, 
131          'CYS': 135.0, 'GLN': 198.0, 'GLU': 194.0, 'GLY': 84.0, 
132          'HIS': 184.0, 'ILE': 169.0, 'LEU': 164.0, 'LYS': 205.0, 
133          'MET': 188.0, 'PHE': 197.0, 'PRO': 136.0, 'SER': 130.0, 
134          'THR': 142.0, 'TRP': 227.0, 'TYR': 222.0, 'VAL': 142.0 
135      } 
136  } 
137   
138   
139 -def ss_to_index(ss):
140 """Secondary structure symbol to index. 141 142 H=0 143 E=1 144 C=2 145 """ 146 if ss == 'H': 147 return 0 148 if ss == 'E': 149 return 1 150 if ss == 'C': 151 return 2 152 assert 0
153 154
155 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
156 """Create a DSSP dictionary from a PDB file. 157 158 Example: 159 -------- 160 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 161 >>> aa, ss, acc=dssp_dict[('A', 1)] 162 163 Parameters 164 ---------- 165 in_file : string 166 pdb file 167 168 DSSP : string 169 DSSP executable (argument to os.system) 170 171 Returns 172 ------- 173 (out_dict, keys) : tuple 174 a dictionary that maps (chainid, resid) to 175 amino acid type, secondary structure code and 176 accessibility. 177 """ 178 # Using universal newlines is important on Python 3, this 179 # gives unicode handles rather than bytes handles. 180 # Newer version of DSSP executable is named 'mkdssp', 181 # and calling 'dssp' will hence not work in some operating systems 182 # (Debian distribution of DSSP includes a symlink for 'dssp' argument) 183 try: 184 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 185 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 186 except FileNotFoundError: 187 if DSSP == "mkdssp": 188 raise 189 p = subprocess.Popen(["mkdssp", in_file], universal_newlines=True, 190 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 191 out, err = p.communicate() 192 193 # Alert user for errors 194 if err.strip(): 195 warnings.warn(err) 196 if not out.strip(): 197 raise Exception('DSSP failed to produce an output') 198 199 out_dict, keys = _make_dssp_dict(StringIO(out)) 200 return out_dict, keys
201 202
203 -def make_dssp_dict(filename):
204 """DSSP dictionary mapping identifers to properties. 205 206 Return a DSSP dictionary that maps (chainid, resid) to 207 aa, ss and accessibility, from a DSSP file. 208 209 Parameters 210 ---------- 211 filename : string 212 the DSSP output file 213 """ 214 with open(filename, "r") as handle: 215 return _make_dssp_dict(handle)
216 217
218 -def _make_dssp_dict(handle):
219 """Internal function used by mask_dssp_dict (PRIVATE). 220 221 Return a DSSP dictionary that maps (chainid, resid) to an amino acid, 222 secondary structure symbol, solvent accessibility value, and hydrogen bond 223 information (relative dssp indices and hydrogen bond energies) from an open 224 DSSP file object. 225 226 Parameters 227 ---------- 228 handle : file 229 the open DSSP output file handle 230 """ 231 dssp = {} 232 start = 0 233 keys = [] 234 for l in handle.readlines(): 235 sl = l.split() 236 if len(sl) < 2: 237 continue 238 if sl[1] == "RESIDUE": 239 # Start parsing from here 240 start = 1 241 continue 242 if not start: 243 continue 244 if l[9] == " ": 245 # Skip -- missing residue 246 continue 247 248 dssp_index = int(l[:5]) 249 resseq = int(l[5:10]) 250 icode = l[10] 251 chainid = l[11] 252 aa = l[13] 253 ss = l[16] 254 if ss == " ": 255 ss = "-" 256 try: 257 NH_O_1_relidx = int(l[38:45]) 258 NH_O_1_energy = float(l[46:50]) 259 O_NH_1_relidx = int(l[50:56]) 260 O_NH_1_energy = float(l[57:61]) 261 NH_O_2_relidx = int(l[61:67]) 262 NH_O_2_energy = float(l[68:72]) 263 O_NH_2_relidx = int(l[72:78]) 264 O_NH_2_energy = float(l[79:83]) 265 266 acc = int(l[34:38]) 267 phi = float(l[103:109]) 268 psi = float(l[109:115]) 269 except ValueError as exc: 270 # DSSP output breaks its own format when there are >9999 271 # residues, since only 4 digits are allocated to the seq num 272 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 273 # Here, look for whitespace to figure out the number of extra 274 # digits, and shift parsing the rest of the line by that amount. 275 if l[34] != ' ': 276 shift = l[34:].find(' ') 277 278 NH_O_1_relidx = int(l[38 + shift:45 + shift]) 279 NH_O_1_energy = float(l[46 + shift:50 + shift]) 280 O_NH_1_relidx = int(l[50 + shift:56 + shift]) 281 O_NH_1_energy = float(l[57 + shift:61 + shift]) 282 NH_O_2_relidx = int(l[61 + shift:67 + shift]) 283 NH_O_2_energy = float(l[68 + shift:72 + shift]) 284 O_NH_2_relidx = int(l[72 + shift:78 + shift]) 285 O_NH_2_energy = float(l[79 + shift:83 + shift]) 286 287 acc = int((l[34 + shift:38 + shift])) 288 phi = float(l[103 + shift:109 + shift]) 289 psi = float(l[109 + shift:115 + shift]) 290 else: 291 raise ValueError(exc) 292 res_id = (" ", resseq, icode) 293 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi, dssp_index, 294 NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 295 NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 296 keys.append((chainid, res_id)) 297 return dssp, keys
298 299
300 -class DSSP(AbstractResiduePropertyMap):
301 """Run DSSP and parse secondary structure and accessibility. 302 303 Run DSSP on a pdb file, and provide a handle to the 304 DSSP secondary structure and accessibility. 305 306 **Note** that DSSP can only handle one model. 307 308 Example: 309 -------- 310 311 >>> p = PDBParser() 312 >>> structure = p.get_structure("1MOT", "1mot.pdb") 313 >>> model = structure[0] 314 >>> dssp = DSSP(model, "1mot.pdb") 315 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 316 >>> a_key = list(dssp.keys())[2] 317 >>> # (dssp index, amino acid, secondary structure, relative ASA, phi, psi, 318 >>> # NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 319 >>> # NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 320 >>> dssp[a_key] 321 (3, 'A', 'H', 0.7075471698113207, -61.2, -42.4, 322 -2, -0.7, 4, -3.0, 1, -0.2, 5, -0.2) 323 """ 324
325 - def __init__(self, model, in_file, dssp="dssp", acc_array="Sander", file_type='PDB'):
326 """Create a DSSP object. 327 328 Parameters 329 ---------- 330 model : Model 331 The first model of the structure 332 in_file : string 333 Either a PDB file or a DSSP file. 334 dssp : string 335 The dssp executable (ie. the argument to os.system) 336 acc_array : string 337 Accessible surface area (ASA) from either Miller et al. (1987), 338 Sander & Rost (1994), or Wilke: Tien et al. 2013, as string 339 Sander/Wilke/Miller. Defaults to Sander. 340 file_type: string 341 File type switch, either PDB or DSSP with PDB as default. 342 """ 343 self.residue_max_acc = residue_max_acc[acc_array] 344 345 # create DSSP dictionary 346 file_type = file_type.upper() 347 assert(file_type in ['PDB', 'DSSP']) 348 # If the input file is a PDB file run DSSP and parse output: 349 if file_type == 'PDB': 350 # Newer versions of DSSP program call the binary 'mkdssp', so 351 # calling 'dssp' will not work in some operating systems 352 # (Debian distribution of DSSP includes a symlink for 'dssp' argument) 353 try: 354 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp) 355 except FileNotFoundError: 356 if dssp == 'dssp': 357 dssp = 'mkdssp' 358 elif dssp == 'mkdssp': 359 dssp = 'dssp' 360 else: 361 raise 362 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp) 363 # If the input file is a DSSP file just parse it directly: 364 elif file_type == 'DSSP': 365 dssp_dict, dssp_keys = make_dssp_dict(in_file) 366 367 dssp_map = {} 368 dssp_list = [] 369 370 def resid2code(res_id): 371 """Serialize a residue's resseq and icode for easy comparison.""" 372 return '%s%s' % (res_id[1], res_id[2])
373 374 # Now create a dictionary that maps Residue objects to 375 # secondary structure and accessibility, and a list of 376 # (residue, (secondary structure, accessibility)) tuples 377 for key in dssp_keys: 378 chain_id, res_id = key 379 chain = model[chain_id] 380 try: 381 res = chain[res_id] 382 except KeyError: 383 # In DSSP, HET field is not considered in residue identifier. 384 # Thus HETATM records may cause unnecessary exceptions. 385 # (See 3jui chain A res 593.) 386 # Try the lookup again with all HETATM other than water 387 res_seq_icode = resid2code(res_id) 388 for r in chain: 389 if r.id[0] not in (' ', 'W'): 390 # Compare resseq + icode 391 if resid2code(r.id) == res_seq_icode: 392 # Found a matching residue 393 res = r 394 break 395 else: 396 raise KeyError(res_id) 397 398 # For disordered residues of point mutations, BioPython uses the 399 # last one as default, But DSSP takes the first one (alternative 400 # location is blank, A or 1). See 1h9h chain E resi 22. 401 # Here we select the res in which all atoms have altloc blank, A or 402 # 1. If no such residues are found, simply use the first one appears 403 # (as DSSP does). 404 if res.is_disordered() == 2: 405 for rk in res.disordered_get_id_list(): 406 # All atoms in the disordered residue should have the same 407 # altloc, so it suffices to check the altloc of the first 408 # atom. 409 altloc = res.child_dict[rk].get_list()[0].get_altloc() 410 if altloc in tuple('A1 '): 411 res.disordered_select(rk) 412 break 413 else: 414 # Simply select the first one 415 res.disordered_select(res.disordered_get_id_list()[0]) 416 417 # Sometimes point mutations are put into HETATM and ATOM with altloc 418 # 'A' and 'B'. 419 # See 3piu chain A residue 273: 420 # <Residue LLP het=H_LLP resseq=273 icode= > 421 # <Residue LYS het= resseq=273 icode= > 422 # DSSP uses the HETATM LLP as it has altloc 'A' 423 # We check the altloc code here. 424 elif res.is_disordered() == 1: 425 # Check altloc of all atoms in the DisorderedResidue. If it 426 # contains blank, A or 1, then use it. Otherwise, look for HET 427 # residues of the same seq+icode. If not such HET residues are 428 # found, just accept the current one. 429 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 430 if altlocs.isdisjoint('A1 '): 431 # Try again with all HETATM other than water 432 res_seq_icode = resid2code(res_id) 433 for r in chain: 434 if r.id[0] not in (' ', 'W'): 435 if resid2code(r.id) == res_seq_icode and \ 436 r.get_list()[0].get_altloc() in tuple('A1 '): 437 res = r 438 break 439 440 (aa, ss, acc, phi, psi, dssp_index, 441 NH_O_1_relidx, NH_O_1_energy, 442 O_NH_1_relidx, O_NH_1_energy, 443 NH_O_2_relidx, NH_O_2_energy, 444 O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key] 445 446 res.xtra["SS_DSSP"] = ss 447 res.xtra["EXP_DSSP_ASA"] = acc 448 res.xtra["PHI_DSSP"] = phi 449 res.xtra["PSI_DSSP"] = psi 450 res.xtra["DSSP_INDEX"] = dssp_index 451 res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx 452 res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy 453 res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx 454 res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy 455 res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx 456 res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy 457 res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx 458 res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy 459 460 # Relative accessibility 461 resname = res.get_resname() 462 try: 463 rel_acc = acc / self.residue_max_acc[resname] 464 except KeyError: 465 # Invalid value for resname 466 rel_acc = 'NA' 467 else: 468 if rel_acc > 1.0: 469 rel_acc = 1.0 470 res.xtra["EXP_DSSP_RASA"] = rel_acc 471 # Verify if AA in DSSP == AA in Structure 472 # Something went wrong if this is not true! 473 # NB: DSSP uses X often 474 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 475 if resname == "C": 476 # DSSP renames C in C-bridges to a,b,c,d,... 477 # - we rename it back to 'C' 478 if _dssp_cys.match(aa): 479 aa = 'C' 480 # Take care of HETATM again 481 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 482 raise PDBException("Structure/DSSP mismatch at %s" % res) 483 484 dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi, 485 NH_O_1_relidx, NH_O_1_energy, 486 O_NH_1_relidx, O_NH_1_energy, 487 NH_O_2_relidx, NH_O_2_energy, 488 O_NH_2_relidx, O_NH_2_energy) 489 490 dssp_map[key] = dssp_vals 491 dssp_list.append(dssp_vals) 492 493 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 494 dssp_list)
495 496 497 if __name__ == "__main__": 498 import sys 499 500 p = PDBParser() 501 s = p.get_structure('X', sys.argv[1]) 502 model = s[0] 503 d = DSSP(model, sys.argv[1]) 504 505 for r in d: 506 print(r) 507 print("Handled %i residues" % len(d)) 508 print(sorted(d)) 509 if ('A', 1) in d: 510 print(d[('A', 1)]) 511 print(s[0]['A'][1].xtra) 512 # Secondary structure 513 print(''.join(item[1] for item in d)) 514