Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. 
 10   
 11  The DSSP codes for secondary structure used here are: 
 12   
 13      - H        Alpha helix (4-12) 
 14      - B        Isolated beta-bridge residue 
 15      - E        Strand 
 16      - G        3-10 helix 
 17      - I        pi helix 
 18      - T        Turn 
 19      - S        Bend 
 20      - -        None 
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  __docformat__ = "restructuredtext en" 
 26   
 27  import re 
 28  from Bio._py3k import StringIO 
 29  import subprocess 
 30  import warnings 
 31   
 32  from Bio.Data import SCOPData 
 33   
 34  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 35  from Bio.PDB.PDBExceptions import PDBException 
 36  from Bio.PDB.PDBParser import PDBParser 
 37   
 38   
 39  # Match C in DSSP 
 40  _dssp_cys = re.compile('[a-z]') 
 41   
 42  # Maximal ASA of amino acids 
 43  # Values from Sander & Rost, (1994), Proteins, 20:216-226 
 44  # Used for relative accessibility 
 45  MAX_ACC = {} 
 46  MAX_ACC["ALA"] = 106.0 
 47  MAX_ACC["CYS"] = 135.0 
 48  MAX_ACC["ASP"] = 163.0 
 49  MAX_ACC["GLU"] = 194.0 
 50  MAX_ACC["PHE"] = 197.0 
 51  MAX_ACC["GLY"] = 84.0 
 52  MAX_ACC["HIS"] = 184.0 
 53  MAX_ACC["ILE"] = 169.0 
 54  MAX_ACC["LYS"] = 205.0 
 55  MAX_ACC["LEU"] = 164.0 
 56  MAX_ACC["MET"] = 188.0 
 57  MAX_ACC["ASN"] = 157.0 
 58  MAX_ACC["PRO"] = 136.0 
 59  MAX_ACC["GLN"] = 198.0 
 60  MAX_ACC["ARG"] = 248.0 
 61  MAX_ACC["SER"] = 130.0 
 62  MAX_ACC["THR"] = 142.0 
 63  MAX_ACC["VAL"] = 142.0 
 64  MAX_ACC["TRP"] = 227.0 
 65  MAX_ACC["TYR"] = 222.0 
 66   
 67   
68 -def ss_to_index(ss):
69 """ 70 Secondary structure symbol to index. 71 H=0 72 E=1 73 C=2 74 """ 75 if ss == 'H': 76 return 0 77 if ss == 'E': 78 return 1 79 if ss == 'C': 80 return 2 81 assert 0
82 83
84 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
85 """ 86 Create a DSSP dictionary from a PDB file. 87 88 Example: 89 -------- 90 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 91 >>> aa, ss, acc=dssp_dict[('A', 1)] 92 93 :: 94 95 @param in_file: pdb file 96 @type in_file: string :: 97 98 @param DSSP: DSSP executable (argument to os.system) 99 @type DSSP: string :: 100 101 @return: a dictionary that maps (chainid, resid) to 102 amino acid type, secondary structure code and 103 accessibility. 104 @rtype: {} 105 """ 106 # Using universal newlines is important on Python 3, this 107 # gives unicode handles rather than bytes handles. 108 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 109 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 110 out, err = p.communicate() 111 112 # Alert user for errors 113 if err.strip(): 114 warnings.warn(err) 115 if not out.strip(): 116 raise Exception('DSSP failed to produce an output') 117 118 out_dict, keys = _make_dssp_dict(StringIO(out)) 119 return out_dict, keys
120 121
122 -def make_dssp_dict(filename):
123 """ 124 Return a DSSP dictionary that maps (chainid, resid) to 125 aa, ss and accessibility, from a DSSP file. :: 126 127 @param filename: the DSSP output file 128 @type filename: string 129 """ 130 with open(filename, "r") as handle: 131 return _make_dssp_dict(handle)
132 133
134 -def _make_dssp_dict(handle):
135 """ 136 Return a DSSP dictionary that maps (chainid, resid) to an amino acid, 137 secondary structure symbol, solvent accessibility value, and hydrogen bond 138 information (relative dssp indices and hydrogen bond energies) from an open 139 DSSP file object. :: 140 141 @param handle: the open DSSP output file handle 142 @type handle: file 143 """ 144 dssp = {} 145 start = 0 146 keys = [] 147 for l in handle.readlines(): 148 sl = l.split() 149 if len(sl) < 2: 150 continue 151 if sl[1] == "RESIDUE": 152 # Start parsing from here 153 start = 1 154 continue 155 if not start: 156 continue 157 if l[9] == " ": 158 # Skip -- missing residue 159 continue 160 161 dssp_index = int(l[:5]) 162 resseq = int(l[5:10]) 163 icode = l[10] 164 chainid = l[11] 165 aa = l[13] 166 ss = l[16] 167 if ss == " ": 168 ss = "-" 169 try: 170 NH_O_1_relidx = int(l[38:45]) 171 NH_O_1_energy = float(l[46:50]) 172 O_NH_1_relidx = int(l[50:56]) 173 O_NH_1_energy = float(l[57:61]) 174 NH_O_2_relidx = int(l[61:67]) 175 NH_O_2_energy = float(l[68:72]) 176 O_NH_2_relidx = int(l[72:78]) 177 O_NH_2_energy = float(l[79:83]) 178 179 acc = int(l[34:38]) 180 phi = float(l[103:109]) 181 psi = float(l[109:115]) 182 except ValueError as exc: 183 # DSSP output breaks its own format when there are >9999 184 # residues, since only 4 digits are allocated to the seq num 185 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 186 # Here, look for whitespace to figure out the number of extra 187 # digits, and shift parsing the rest of the line by that amount. 188 if l[34] != ' ': 189 shift = l[34:].find(' ') 190 191 NH_O_1_relidx = int(l[38 + shift:45 + shift]) 192 NH_O_1_energy = float(l[46 + shift:50 + shift]) 193 O_NH_1_relidx = int(l[50 + shift:56 + shift]) 194 O_NH_1_energy = float(l[57 + shift:61 + shift]) 195 NH_O_2_relidx = int(l[61 + shift:67 + shift]) 196 NH_O_2_energy = float(l[68 + shift:72 + shift]) 197 O_NH_2_relidx = int(l[72 + shift:78 + shift]) 198 O_NH_2_energy = float(l[79 + shift:83 + shift]) 199 200 acc = int((l[34 + shift:38 + shift])) 201 phi = float(l[103 + shift:109 + shift]) 202 psi = float(l[109 + shift:115 + shift]) 203 else: 204 raise ValueError(exc) 205 res_id = (" ", resseq, icode) 206 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi, dssp_index, 207 NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy, 208 NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy) 209 keys.append((chainid, res_id)) 210 return dssp, keys
211 212
213 -class DSSP(AbstractResiduePropertyMap):
214 """ 215 Run DSSP on a pdb file, and provide a handle to the 216 DSSP secondary structure and accessibility. 217 218 **Note** that DSSP can only handle one model. 219 220 Example: 221 -------- 222 223 >>> p = PDBParser() 224 >>> structure = p.get_structure("1MOT", "1MOT.pdb") 225 >>> model = structure[0] 226 >>> dssp = DSSP(model, "1MOT.pdb") 227 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 228 >>> a_key = list(dssp)[2] 229 >>> # residue object, secondary structure, solvent accessibility, 230 >>> # relative accessiblity, phi, psi 231 >>> dssp[a_key] 232 (<Residue ALA het= resseq=251 icode= >, 233 'H', 234 72, 235 0.67924528301886788, 236 -61.200000000000003, 237 -42.399999999999999) 238 """ 239
240 - def __init__(self, model, pdb_file, dssp="dssp"):
241 """ 242 :: 243 244 @param model: the first model of the structure 245 @type model: L{Model} :: 246 247 @param pdb_file: a PDB file 248 @type pdb_file: string :: 249 250 @param dssp: the dssp executable (ie. the argument to os.system) 251 @type dssp: string 252 """ 253 # create DSSP dictionary 254 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp) 255 dssp_map = {} 256 dssp_list = [] 257 258 def resid2code(res_id): 259 """Serialize a residue's resseq and icode for easy comparison.""" 260 return '%s%s' % (res_id[1], res_id[2])
261 262 # Now create a dictionary that maps Residue objects to 263 # secondary structure and accessibility, and a list of 264 # (residue, (secondary structure, accessibility)) tuples 265 for key in dssp_keys: 266 chain_id, res_id = key 267 chain = model[chain_id] 268 try: 269 res = chain[res_id] 270 except KeyError: 271 # In DSSP, HET field is not considered in residue identifier. 272 # Thus HETATM records may cause unnecessary exceptions. 273 # (See 3jui chain A res 593.) 274 # Try the lookup again with all HETATM other than water 275 res_seq_icode = resid2code(res_id) 276 for r in chain: 277 if r.id[0] not in (' ', 'W'): 278 # Compare resseq + icode 279 if resid2code(r.id) == res_seq_icode: 280 # Found a matching residue 281 res = r 282 break 283 else: 284 raise KeyError(res_id) 285 286 # For disordered residues of point mutations, BioPython uses the 287 # last one as default, But DSSP takes the first one (alternative 288 # location is blank, A or 1). See 1h9h chain E resi 22. 289 # Here we select the res in which all atoms have altloc blank, A or 290 # 1. If no such residues are found, simply use the first one appears 291 # (as DSSP does). 292 if res.is_disordered() == 2: 293 for rk in res.disordered_get_id_list(): 294 # All atoms in the disordered residue should have the same 295 # altloc, so it suffices to check the altloc of the first 296 # atom. 297 altloc = res.child_dict[rk].get_list()[0].get_altloc() 298 if altloc in tuple('A1 '): 299 res.disordered_select(rk) 300 break 301 else: 302 # Simply select the first one 303 res.disordered_select(res.disordered_get_id_list()[0]) 304 305 # Sometimes point mutations are put into HETATM and ATOM with altloc 306 # 'A' and 'B'. 307 # See 3piu chain A residue 273: 308 # <Residue LLP het=H_LLP resseq=273 icode= > 309 # <Residue LYS het= resseq=273 icode= > 310 # DSSP uses the HETATM LLP as it has altloc 'A' 311 # We check the altloc code here. 312 elif res.is_disordered() == 1: 313 # Check altloc of all atoms in the DisorderedResidue. If it 314 # contains blank, A or 1, then use it. Otherwise, look for HET 315 # residues of the same seq+icode. If not such HET residues are 316 # found, just accept the current one. 317 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 318 if altlocs.isdisjoint('A1 '): 319 # Try again with all HETATM other than water 320 res_seq_icode = resid2code(res_id) 321 for r in chain: 322 if r.id[0] not in (' ', 'W'): 323 if resid2code(r.id) == res_seq_icode and \ 324 r.get_list()[0].get_altloc() in tuple('A1 '): 325 res = r 326 break 327 328 (aa, ss, acc, phi, psi, dssp_index, 329 NH_O_1_relidx, NH_O_1_energy, 330 O_NH_1_relidx, O_NH_1_energy, 331 NH_O_2_relidx, NH_O_2_energy, 332 O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key] 333 334 res.xtra["SS_DSSP"] = ss 335 res.xtra["EXP_DSSP_ASA"] = acc 336 res.xtra["PHI_DSSP"] = phi 337 res.xtra["PSI_DSSP"] = psi 338 res.xtra["DSSP_INDEX"] = dssp_index 339 res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx 340 res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy 341 res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx 342 res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy 343 res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx 344 res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy 345 res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx 346 res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy 347 348 # Relative accessibility 349 resname = res.get_resname() 350 try: 351 rel_acc = acc / MAX_ACC[resname] 352 except KeyError: 353 # Invalid value for resname 354 rel_acc = 'NA' 355 else: 356 if rel_acc > 1.0: 357 rel_acc = 1.0 358 res.xtra["EXP_DSSP_RASA"] = rel_acc 359 # Verify if AA in DSSP == AA in Structure 360 # Something went wrong if this is not true! 361 # NB: DSSP uses X often 362 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 363 if resname == "C": 364 # DSSP renames C in C-bridges to a,b,c,d,... 365 # - we rename it back to 'C' 366 if _dssp_cys.match(aa): 367 aa = 'C' 368 # Take care of HETATM again 369 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 370 raise PDBException("Structure/DSSP mismatch at %s" % res) 371 372 dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi, 373 NH_O_1_relidx, NH_O_1_energy, 374 O_NH_1_relidx, O_NH_1_energy, 375 NH_O_2_relidx, NH_O_2_energy, 376 O_NH_2_relidx, O_NH_2_energy) 377 378 dssp_map[key] = dssp_vals 379 dssp_list.append(dssp_vals) 380 381 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 382 dssp_list)
383 384 385 if __name__ == "__main__": 386 import sys 387 388 p = PDBParser() 389 s = p.get_structure('X', sys.argv[1]) 390 model = s[0] 391 d = DSSP(model, sys.argv[1]) 392 393 for r in d: 394 print(r) 395 print("Handled %i residues" % len(d)) 396 print(sorted(d)) 397 if ('A', 1) in d: 398 print(d[('A', 1)]) 399 print(s[0]['A'][1].xtra) 400 # Secondary structure 401 print(''.join(item[1] for item in d)) 402