Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. 
 10   
 11  The DSSP codes for secondary structure used here are: 
 12   
 13      - H        Alpha helix (4-12) 
 14      - B        Isolated beta-bridge residue 
 15      - E        Strand 
 16      - G        3-10 helix 
 17      - I        pi helix 
 18      - T        Turn 
 19      - S        Bend 
 20      - -        None 
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  import re 
 26  from Bio._py3k import StringIO 
 27  import subprocess 
 28  import warnings 
 29   
 30  from Bio.Data import SCOPData 
 31   
 32  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 33  from Bio.PDB.PDBExceptions import PDBException 
 34  from Bio.PDB.PDBParser import PDBParser 
 35   
 36   
 37  # Match C in DSSP 
 38  _dssp_cys = re.compile('[a-z]') 
 39   
 40  # Maximal ASA of amino acids 
 41  # Values from Sander & Rost, (1994), Proteins, 20:216-226 
 42  # Used for relative accessibility 
 43  MAX_ACC = {} 
 44  MAX_ACC["ALA"] = 106.0 
 45  MAX_ACC["CYS"] = 135.0 
 46  MAX_ACC["ASP"] = 163.0 
 47  MAX_ACC["GLU"] = 194.0 
 48  MAX_ACC["PHE"] = 197.0 
 49  MAX_ACC["GLY"] = 84.0 
 50  MAX_ACC["HIS"] = 184.0 
 51  MAX_ACC["ILE"] = 169.0 
 52  MAX_ACC["LYS"] = 205.0 
 53  MAX_ACC["LEU"] = 164.0 
 54  MAX_ACC["MET"] = 188.0 
 55  MAX_ACC["ASN"] = 157.0 
 56  MAX_ACC["PRO"] = 136.0 
 57  MAX_ACC["GLN"] = 198.0 
 58  MAX_ACC["ARG"] = 248.0 
 59  MAX_ACC["SER"] = 130.0 
 60  MAX_ACC["THR"] = 142.0 
 61  MAX_ACC["VAL"] = 142.0 
 62  MAX_ACC["TRP"] = 227.0 
 63  MAX_ACC["TYR"] = 222.0 
 64   
 65   
66 -def ss_to_index(ss):
67 """ 68 Secondary structure symbol to index. 69 H=0 70 E=1 71 C=2 72 """ 73 if ss == 'H': 74 return 0 75 if ss == 'E': 76 return 1 77 if ss == 'C': 78 return 2 79 assert 0
80 81
82 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
83 """ 84 Create a DSSP dictionary from a PDB file. 85 86 Example: 87 -------- 88 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 89 >>> aa, ss, acc=dssp_dict[('A', 1)] 90 91 :: 92 93 @param in_file: pdb file 94 @type in_file: string :: 95 96 @param DSSP: DSSP executable (argument to os.system) 97 @type DSSP: string :: 98 99 @return: a dictionary that maps (chainid, resid) to 100 amino acid type, secondary structure code and 101 accessibility. 102 @rtype: {} 103 """ 104 # Using universal newlines is important on Python 3, this 105 # gives unicode handles rather than bytes handles. 106 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 107 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 108 out, err = p.communicate() 109 110 # Alert user for errors 111 if err.strip(): 112 warnings.warn(err) 113 if not out.strip(): 114 raise Exception('DSSP failed to produce an output') 115 116 out_dict, keys = _make_dssp_dict(StringIO(out)) 117 return out_dict, keys
118 119
120 -def make_dssp_dict(filename):
121 """ 122 Return a DSSP dictionary that maps (chainid, resid) to 123 aa, ss and accessibility, from a DSSP file. :: 124 125 @param filename: the DSSP output file 126 @type filename: string 127 """ 128 with open(filename, "r") as handle: 129 return _make_dssp_dict(handle)
130 131
132 -def _make_dssp_dict(handle):
133 """ 134 Return a DSSP dictionary that maps (chainid, resid) to an amino acid, 135 secondary structure symbol, solvent accessibility value, and hydrogen bond 136 information (relative dssp indices and hydrogen bond energies) from an open 137 DSSP file object. :: 138 139 @param handle: the open DSSP output file handle 140 @type handle: file 141 """ 142 dssp = {} 143 start = 0 144 keys = [] 145 for l in handle.readlines(): 146 sl = l.split() 147 if len(sl) < 2: 148 continue 149 if sl[1] == "RESIDUE": 150 # Start parsing from here 151 start = 1 152 continue 153 if not start: 154 continue 155 if l[9] == " ": 156 # Skip -- missing residue 157 continue 158 159 dssp_index = int(l[:5]) 160 resseq = int(l[5:10]) 161 icode = l[10] 162 chainid = l[11] 163 aa = l[13] 164 ss = l[16] 165 if ss == " ": 166 ss = "-" 167 try: 168 NH_O_1_relidx = int(l[38:45]) 169 NH_O_1_energy = float(l[46:50]) 170 O_NH_1_relidx = int(l[50:56]) 171 O_NH_1_energy = float(l[57:61]) 172 NH_O_2_relidx = int(l[61:67]) 173 NH_O_2_energy = float(l[68:72]) 174 O_NH_2_relidx = int(l[72:78]) 175 O_NH_2_energy = float(l[79:83]) 176 177 acc = int(l[34:38]) 178 phi = float(l[103:109]) 179 psi = float(l[109:115]) 180 except ValueError as exc: 181 # DSSP output breaks its own format when there are >9999 182 # residues, since only 4 digits are allocated to the seq num 183 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 184 # Here, look for whitespace to figure out the number of extra 185 # digits, and shift parsing the rest of the line by that amount. 186 if l[34] != ' ': 187 shift = l[34:].find(' ') 188 189 NH_O_1_relidx = int(l[38 + shift:45 + shift]) 190 NH_O_1_energy = float(l[46 + shift:50 + shift]) 191 O_NH_1_relidx = int(l[50 + shift:56 + shift]) 192 O_NH_1_energy = float(l[57 + shift:61 + shift]) 193 NH_O_2_relidx = int(l[61 + shift:67 + shift]) 194 NH_O_2_energy = float(l[68 + shift:72 + shift]) 195 O_NH_2_relidx = int(l[72 + shift:78 + shift]) 196 O_NH_2_energy = float(l[79 + shift:83 + shift]) 197 198 acc = int((l[34 + shift:38 + shift])) 199 phi = float(l[103 + shift:109 + shift]) 200 psi = float(l[109 + shift:115 + shift]) 201 else: 202 raise ValueError(exc) 203 res_id = (" ", resseq, icode) 204 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi, dssp_index, 205 NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 206 NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 207 keys.append((chainid, res_id)) 208 return dssp, keys
209 210
211 -class DSSP(AbstractResiduePropertyMap):
212 """ 213 Run DSSP on a pdb file, and provide a handle to the 214 DSSP secondary structure and accessibility. 215 216 **Note** that DSSP can only handle one model. 217 218 Example: 219 -------- 220 221 >>> p = PDBParser() 222 >>> structure = p.get_structure("1MOT", "1MOT.pdb") 223 >>> model = structure[0] 224 >>> dssp = DSSP(model, "1MOT.pdb") 225 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 226 >>> a_key = list(dssp.keys())[2] 227 >>> # residue object, secondary structure, solvent accessibility, 228 >>> # relative accessiblity, phi, psi 229 >>> dssp[a_key] 230 (<Residue ALA het= resseq=251 icode= >, 231 'H', 232 72, 233 0.67924528301886788, 234 -61.200000000000003, 235 -42.399999999999999) 236 """ 237
238 - def __init__(self, model, pdb_file, dssp="dssp"):
239 """ 240 :: 241 242 @param model: the first model of the structure 243 @type model: L{Model} :: 244 245 @param pdb_file: a PDB file 246 @type pdb_file: string :: 247 248 @param dssp: the dssp executable (ie. the argument to os.system) 249 @type dssp: string 250 """ 251 # create DSSP dictionary 252 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp) 253 dssp_map = {} 254 dssp_list = [] 255 256 def resid2code(res_id): 257 """Serialize a residue's resseq and icode for easy comparison.""" 258 return '%s%s' % (res_id[1], res_id[2])
259 260 # Now create a dictionary that maps Residue objects to 261 # secondary structure and accessibility, and a list of 262 # (residue, (secondary structure, accessibility)) tuples 263 for key in dssp_keys: 264 chain_id, res_id = key 265 chain = model[chain_id] 266 try: 267 res = chain[res_id] 268 except KeyError: 269 # In DSSP, HET field is not considered in residue identifier. 270 # Thus HETATM records may cause unnecessary exceptions. 271 # (See 3jui chain A res 593.) 272 # Try the lookup again with all HETATM other than water 273 res_seq_icode = resid2code(res_id) 274 for r in chain: 275 if r.id[0] not in (' ', 'W'): 276 # Compare resseq + icode 277 if resid2code(r.id) == res_seq_icode: 278 # Found a matching residue 279 res = r 280 break 281 else: 282 raise KeyError(res_id) 283 284 # For disordered residues of point mutations, BioPython uses the 285 # last one as default, But DSSP takes the first one (alternative 286 # location is blank, A or 1). See 1h9h chain E resi 22. 287 # Here we select the res in which all atoms have altloc blank, A or 288 # 1. If no such residues are found, simply use the first one appears 289 # (as DSSP does). 290 if res.is_disordered() == 2: 291 for rk in res.disordered_get_id_list(): 292 # All atoms in the disordered residue should have the same 293 # altloc, so it suffices to check the altloc of the first 294 # atom. 295 altloc = res.child_dict[rk].get_list()[0].get_altloc() 296 if altloc in tuple('A1 '): 297 res.disordered_select(rk) 298 break 299 else: 300 # Simply select the first one 301 res.disordered_select(res.disordered_get_id_list()[0]) 302 303 # Sometimes point mutations are put into HETATM and ATOM with altloc 304 # 'A' and 'B'. 305 # See 3piu chain A residue 273: 306 # <Residue LLP het=H_LLP resseq=273 icode= > 307 # <Residue LYS het= resseq=273 icode= > 308 # DSSP uses the HETATM LLP as it has altloc 'A' 309 # We check the altloc code here. 310 elif res.is_disordered() == 1: 311 # Check altloc of all atoms in the DisorderedResidue. If it 312 # contains blank, A or 1, then use it. Otherwise, look for HET 313 # residues of the same seq+icode. If not such HET residues are 314 # found, just accept the current one. 315 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 316 if altlocs.isdisjoint('A1 '): 317 # Try again with all HETATM other than water 318 res_seq_icode = resid2code(res_id) 319 for r in chain: 320 if r.id[0] not in (' ', 'W'): 321 if resid2code(r.id) == res_seq_icode and \ 322 r.get_list()[0].get_altloc() in tuple('A1 '): 323 res = r 324 break 325 326 (aa, ss, acc, phi, psi, dssp_index, 327 NH_O_1_relidx, NH_O_1_energy, 328 O_NH_1_relidx, O_NH_1_energy, 329 NH_O_2_relidx, NH_O_2_energy, 330 O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key] 331 332 res.xtra["SS_DSSP"] = ss 333 res.xtra["EXP_DSSP_ASA"] = acc 334 res.xtra["PHI_DSSP"] = phi 335 res.xtra["PSI_DSSP"] = psi 336 res.xtra["DSSP_INDEX"] = dssp_index 337 res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx 338 res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy 339 res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx 340 res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy 341 res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx 342 res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy 343 res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx 344 res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy 345 346 # Relative accessibility 347 resname = res.get_resname() 348 try: 349 rel_acc = acc / MAX_ACC[resname] 350 except KeyError: 351 # Invalid value for resname 352 rel_acc = 'NA' 353 else: 354 if rel_acc > 1.0: 355 rel_acc = 1.0 356 res.xtra["EXP_DSSP_RASA"] = rel_acc 357 # Verify if AA in DSSP == AA in Structure 358 # Something went wrong if this is not true! 359 # NB: DSSP uses X often 360 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 361 if resname == "C": 362 # DSSP renames C in C-bridges to a,b,c,d,... 363 # - we rename it back to 'C' 364 if _dssp_cys.match(aa): 365 aa = 'C' 366 # Take care of HETATM again 367 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 368 raise PDBException("Structure/DSSP mismatch at %s" % res) 369 370 dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi, 371 NH_O_1_relidx, NH_O_1_energy, 372 O_NH_1_relidx, O_NH_1_energy, 373 NH_O_2_relidx, NH_O_2_energy, 374 O_NH_2_relidx, O_NH_2_energy) 375 376 dssp_map[key] = dssp_vals 377 dssp_list.append(dssp_vals) 378 379 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 380 dssp_list)
381 382 383 if __name__ == "__main__": 384 import sys 385 386 p = PDBParser() 387 s = p.get_structure('X', sys.argv[1]) 388 model = s[0] 389 d = DSSP(model, sys.argv[1]) 390 391 for r in d: 392 print(r) 393 print("Handled %i residues" % len(d)) 394 print(sorted(d)) 395 if ('A', 1) in d: 396 print(d[('A', 1)]) 397 print(s[0]['A'][1].xtra) 398 # Secondary structure 399 print(''.join(item[1] for item in d)) 400