Package Bio :: Package PDB :: Module DSSP'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.DSSP'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Use the DSSP program to calculate secondary structure and accessibility. 
  7   
  8  You need to have a working version of DSSP (and a license, free for academic 
  9  use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. 
 10   
 11  The DSSP codes for secondary structure used here are: 
 12   
 13      - H        Alpha helix (4-12) 
 14      - B        Isolated beta-bridge residue 
 15      - E        Strand 
 16      - G        3-10 helix 
 17      - I        pi helix 
 18      - T        Turn 
 19      - S        Bend 
 20      - -        None 
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  __docformat__ = "epytext en" 
 26   
 27  import re 
 28  from Bio._py3k import StringIO 
 29  import subprocess 
 30   
 31  from Bio.Data import SCOPData 
 32   
 33  from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap 
 34  from Bio.PDB.PDBExceptions import PDBException 
 35  from Bio.PDB.PDBParser import PDBParser 
 36   
 37   
 38  # Match C in DSSP 
 39  _dssp_cys=re.compile('[a-z]') 
 40   
 41  # Maximal ASA of amino acids 
 42  # Values from Sander & Rost, (1994), Proteins, 20:216-226 
 43  # Used for relative accessibility 
 44  MAX_ACC={} 
 45  MAX_ACC["ALA"]=106.0 
 46  MAX_ACC["CYS"]=135.0 
 47  MAX_ACC["ASP"]=163.0 
 48  MAX_ACC["GLU"]=194.0 
 49  MAX_ACC["PHE"]=197.0 
 50  MAX_ACC["GLY"]=84.0 
 51  MAX_ACC["HIS"]=184.0 
 52  MAX_ACC["ILE"]=169.0 
 53  MAX_ACC["LYS"]=205.0 
 54  MAX_ACC["LEU"]=164.0 
 55  MAX_ACC["MET"]=188.0 
 56  MAX_ACC["ASN"]=157.0 
 57  MAX_ACC["PRO"]=136.0 
 58  MAX_ACC["GLN"]=198.0 
 59  MAX_ACC["ARG"]=248.0 
 60  MAX_ACC["SER"]=130.0 
 61  MAX_ACC["THR"]=142.0 
 62  MAX_ACC["VAL"]=142.0 
 63  MAX_ACC["TRP"]=227.0 
 64  MAX_ACC["TYR"]=222.0 
 65   
 66   
67 -def ss_to_index(ss):
68 """ 69 Secondary structure symbol to index. 70 H=0 71 E=1 72 C=2 73 """ 74 if ss=='H': 75 return 0 76 if ss=='E': 77 return 1 78 if ss=='C': 79 return 2 80 assert 0
81 82
83 -def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
84 """ 85 Create a DSSP dictionary from a PDB file. 86 87 Example: 88 >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") 89 >>> aa, ss, acc=dssp_dict[('A', 1)] 90 91 @param in_file: pdb file 92 @type in_file: string 93 94 @param DSSP: DSSP executable (argument to os.system) 95 @type DSSP: string 96 97 @return: a dictionary that maps (chainid, resid) to 98 amino acid type, secondary structure code and 99 accessibility. 100 @rtype: {} 101 """ 102 #Using universal newlines is important on Python 3, this 103 #gives unicode handles rather than bytes handles. 104 p = subprocess.Popen([DSSP, in_file], universal_newlines=True, 105 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 106 out, err = p.communicate() 107 out_dict, keys = _make_dssp_dict(StringIO(out)) 108 return out_dict, keys
109 110
111 -def make_dssp_dict(filename):
112 """ 113 Return a DSSP dictionary that maps (chainid, resid) to 114 aa, ss and accessibility, from a DSSP file. 115 116 @param filename: the DSSP output file 117 @type filename: string 118 """ 119 with open(filename, "r") as handle: 120 return _make_dssp_dict(handle)
121
122 -def _make_dssp_dict(handle):
123 """ 124 Return a DSSP dictionary that maps (chainid, resid) to 125 aa, ss and accessibility, from an open DSSP file object. 126 127 @param handle: the open DSSP output file handle 128 @type handle: file 129 """ 130 dssp = {} 131 start = 0 132 keys = [] 133 for l in handle.readlines(): 134 sl = l.split() 135 if len(sl) < 2: 136 continue 137 if sl[1] == "RESIDUE": 138 # Start parsing from here 139 start = 1 140 continue 141 if not start: 142 continue 143 if l[9] == " ": 144 # Skip -- missing residue 145 continue 146 resseq = int(l[5:10]) 147 icode = l[10] 148 chainid = l[11] 149 aa = l[13] 150 ss = l[16] 151 if ss == " ": 152 ss = "-" 153 try: 154 acc = int(l[34:38]) 155 phi = float(l[103:109]) 156 psi = float(l[109:115]) 157 except ValueError as exc: 158 # DSSP output breaks its own format when there are >9999 159 # residues, since only 4 digits are allocated to the seq num 160 # field. See 3kic chain T res 321, 1vsy chain T res 6077. 161 # Here, look for whitespace to figure out the number of extra 162 # digits, and shift parsing the rest of the line by that amount. 163 if l[34] != ' ': 164 shift = l[34:].find(' ') 165 acc = int((l[34+shift:38+shift])) 166 phi = float(l[103+shift:109+shift]) 167 psi = float(l[109+shift:115+shift]) 168 else: 169 raise ValueError(exc) 170 res_id = (" ", resseq, icode) 171 dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi) 172 keys.append((chainid, res_id)) 173 return dssp, keys
174 175
176 -class DSSP(AbstractResiduePropertyMap):
177 """ 178 Run DSSP on a pdb file, and provide a handle to the 179 DSSP secondary structure and accessibility. 180 181 Note that DSSP can only handle one model. 182 183 Example: 184 185 >>> p = PDBParser() 186 >>> structure = p.get_structure("1MOT", "1MOT.pdb") 187 >>> model = structure[0] 188 >>> dssp = DSSP(model, "1MOT.pdb") 189 >>> # DSSP data is accessed by a tuple (chain_id, res_id) 190 >>> a_key = list(dssp)[2] 191 >>> # residue object, secondary structure, solvent accessibility, 192 >>> # relative accessiblity, phi, psi 193 >>> dssp[a_key] 194 (<Residue ALA het= resseq=251 icode= >, 195 'H', 196 72, 197 0.67924528301886788, 198 -61.200000000000003, 199 -42.399999999999999) 200 """ 201
202 - def __init__(self, model, pdb_file, dssp="dssp"):
203 """ 204 @param model: the first model of the structure 205 @type model: L{Model} 206 207 @param pdb_file: a PDB file 208 @type pdb_file: string 209 210 @param dssp: the dssp executable (ie. the argument to os.system) 211 @type dssp: string 212 """ 213 # create DSSP dictionary 214 dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp) 215 dssp_map = {} 216 dssp_list = [] 217 218 def resid2code(res_id): 219 """Serialize a residue's resseq and icode for easy comparison.""" 220 return '%s%s' % (res_id[1], res_id[2])
221 222 # Now create a dictionary that maps Residue objects to 223 # secondary structure and accessibility, and a list of 224 # (residue, (secondary structure, accessibility)) tuples 225 for key in dssp_keys: 226 chain_id, res_id = key 227 chain = model[chain_id] 228 try: 229 res = chain[res_id] 230 except KeyError: 231 # In DSSP, HET field is not considered in residue identifier. 232 # Thus HETATM records may cause unnecessary exceptions. 233 # (See 3jui chain A res 593.) 234 # Try the lookup again with all HETATM other than water 235 res_seq_icode = resid2code(res_id) 236 for r in chain: 237 if r.id[0] not in (' ', 'W'): 238 # Compare resseq + icode 239 if resid2code(r.id) == res_seq_icode: 240 # Found a matching residue 241 res = r 242 break 243 else: 244 raise KeyError(res_id) 245 246 # For disordered residues of point mutations, BioPython uses the 247 # last one as default, But DSSP takes the first one (alternative 248 # location is blank, A or 1). See 1h9h chain E resi 22. 249 # Here we select the res in which all atoms have altloc blank, A or 250 # 1. If no such residues are found, simply use the first one appears 251 # (as DSSP does). 252 if res.is_disordered() == 2: 253 for rk in res.disordered_get_id_list(): 254 # All atoms in the disordered residue should have the same 255 # altloc, so it suffices to check the altloc of the first 256 # atom. 257 altloc = res.child_dict[rk].get_list()[0].get_altloc() 258 if altloc in tuple('A1 '): 259 res.disordered_select(rk) 260 break 261 else: 262 # Simply select the first one 263 res.disordered_select(res.disordered_get_id_list()[0]) 264 265 # Sometimes point mutations are put into HETATM and ATOM with altloc 266 # 'A' and 'B'. 267 # See 3piu chain A residue 273: 268 # <Residue LLP het=H_LLP resseq=273 icode= > 269 # <Residue LYS het= resseq=273 icode= > 270 # DSSP uses the HETATM LLP as it has altloc 'A' 271 # We check the altloc code here. 272 elif res.is_disordered() == 1: 273 # Check altloc of all atoms in the DisorderedResidue. If it 274 # contains blank, A or 1, then use it. Otherwise, look for HET 275 # residues of the same seq+icode. If not such HET residues are 276 # found, just accept the current one. 277 altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) 278 if altlocs.isdisjoint('A1 '): 279 # Try again with all HETATM other than water 280 res_seq_icode = resid2code(res_id) 281 for r in chain: 282 if r.id[0] not in (' ', 'W'): 283 if resid2code(r.id) == res_seq_icode and \ 284 r.get_list()[0].get_altloc() in tuple('A1 '): 285 res = r 286 break 287 288 aa, ss, acc, phi, psi = dssp_dict[key] 289 res.xtra["SS_DSSP"] = ss 290 res.xtra["EXP_DSSP_ASA"] = acc 291 res.xtra["PHI_DSSP"] = phi 292 res.xtra["PSI_DSSP"] = psi 293 # Relative accessibility 294 resname = res.get_resname() 295 try: 296 rel_acc = acc/MAX_ACC[resname] 297 except KeyError: 298 # Invalid value for resname 299 rel_acc = 'NA' 300 else: 301 if rel_acc > 1.0: 302 rel_acc = 1.0 303 res.xtra["EXP_DSSP_RASA"] = rel_acc 304 # Verify if AA in DSSP == AA in Structure 305 # Something went wrong if this is not true! 306 # NB: DSSP uses X often 307 resname = SCOPData.protein_letters_3to1.get(resname, 'X') 308 if resname == "C": 309 # DSSP renames C in C-bridges to a,b,c,d,... 310 # - we rename it back to 'C' 311 if _dssp_cys.match(aa): 312 aa = 'C' 313 # Take care of HETATM again 314 if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): 315 raise PDBException("Structure/DSSP mismatch at %s" % res) 316 dssp_map[key] = ((res, ss, acc, rel_acc, phi, psi)) 317 dssp_list.append((res, ss, acc, rel_acc, phi, psi)) 318 319 AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, 320 dssp_list)
321 322 323 if __name__ == "__main__": 324 import sys 325 326 p = PDBParser() 327 s = p.get_structure('X', sys.argv[1]) 328 model = s[0] 329 d = DSSP(model, sys.argv[1]) 330 331 for r in d: 332 print(r) 333 print("Handled %i residues" % len(d)) 334 print(sorted(d)) 335 if ('A', 1) in d: 336 print(d[('A', 1)]) 337 print(s[0]['A'][1].xtra) 338 # Secondary structure 339 print(''.join(item[1] for item in d)) 340