Package Bio :: Package PDB :: Module Polypeptide
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.Polypeptide

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Polypeptide-related classes (construction and representation). 
  7   
  8  Simple example with multiple chains, 
  9   
 10      >>> from Bio.PDB.PDBParser import PDBParser 
 11      >>> from Bio.PDB.Polypeptide import PPBuilder 
 12      >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') 
 13      >>> ppb=PPBuilder() 
 14      >>> for pp in ppb.build_peptides(structure): 
 15      ...     print(pp.get_sequence()) 
 16      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 17      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 18      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 19      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 20      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 21   
 22  Example with non-standard amino acids using HETATM lines in the PDB file, 
 23  in this case selenomethionine (MSE): 
 24   
 25      >>> from Bio.PDB.PDBParser import PDBParser 
 26      >>> from Bio.PDB.Polypeptide import PPBuilder 
 27      >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') 
 28      >>> ppb=PPBuilder() 
 29      >>> for pp in ppb.build_peptides(structure): 
 30      ...     print(pp.get_sequence()) 
 31      DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW 
 32      TETLLVQNANPDCKTILKALGPGATLEE 
 33      TACQG 
 34   
 35  If you want to, you can include non-standard amino acids in the peptides: 
 36   
 37      >>> for pp in ppb.build_peptides(structure, aa_only=False): 
 38      ...     print(pp.get_sequence()) 
 39      ...     print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname())) 
 40      ...     print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname())) 
 41      ...     print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname())) 
 42      MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG 
 43      M MSE 
 44      M MSE 
 45      M MSE 
 46   
 47  In this case the selenomethionines (the first and also seventh and sixth from 
 48  last residues) have been shown as M (methionine) by the get_sequence method. 
 49  """ 
 50   
 51  from __future__ import print_function 
 52  from Bio._py3k import basestring 
 53   
 54  import warnings 
 55   
 56  from Bio.Alphabet import generic_protein 
 57  from Bio.Data import SCOPData 
 58  from Bio.Seq import Seq 
 59  from Bio.PDB.PDBExceptions import PDBException 
 60  from Bio.PDB.Vector import calc_dihedral, calc_angle 
 61   
 62   
 63  standard_aa_names=["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", 
 64                     "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", 
 65                     "TRP", "TYR"] 
 66   
 67   
 68  aa1="ACDEFGHIKLMNPQRSTVWY" 
 69  aa3=standard_aa_names 
 70   
 71  d1_to_index={} 
 72  dindex_to_1={} 
 73  d3_to_index={} 
 74  dindex_to_3={} 
 75   
 76  # Create some lookup tables 
 77  for i in range(0, 20): 
 78      n1=aa1[i] 
 79      n3=aa3[i] 
 80      d1_to_index[n1]=i 
 81      dindex_to_1[i]=n1 
 82      d3_to_index[n3]=i 
 83      dindex_to_3[i]=n3 
 84   
 85   
86 -def index_to_one(index):
87 """Index to corresponding one letter amino acid name. 88 89 >>> index_to_one(0) 90 'A' 91 >>> index_to_one(19) 92 'Y' 93 """ 94 return dindex_to_1[index]
95 96
97 -def one_to_index(s):
98 """One letter code to index. 99 100 >>> one_to_index('A') 101 0 102 >>> one_to_index('Y') 103 19 104 """ 105 return d1_to_index[s]
106 107
108 -def index_to_three(i):
109 """Index to corresponding three letter amino acid name. 110 111 >>> index_to_three(0) 112 'ALA' 113 >>> index_to_three(19) 114 'TYR' 115 """ 116 return dindex_to_3[i]
117 118
119 -def three_to_index(s):
120 """Three letter code to index. 121 122 >>> three_to_index('ALA') 123 0 124 >>> three_to_index('TYR') 125 19 126 """ 127 return d3_to_index[s]
128 129
130 -def three_to_one(s):
131 """Three letter code to one letter code. 132 133 >>> three_to_one('ALA') 134 'A' 135 >>> three_to_one('TYR') 136 'Y' 137 138 For non-standard amino acids, you get a KeyError: 139 140 >>> three_to_one('MSE') 141 Traceback (most recent call last): 142 ... 143 KeyError: 'MSE' 144 """ 145 i=d3_to_index[s] 146 return dindex_to_1[i]
147 148
149 -def one_to_three(s):
150 """One letter code to three letter code. 151 152 >>> one_to_three('A') 153 'ALA' 154 >>> one_to_three('Y') 155 'TYR' 156 """ 157 i=d1_to_index[s] 158 return dindex_to_3[i]
159 160
161 -def is_aa(residue, standard=False):
162 """Return True if residue object/string is an amino acid. 163 164 @param residue: a L{Residue} object OR a three letter amino acid code 165 @type residue: L{Residue} or string 166 167 @param standard: flag to check for the 20 AA (default false) 168 @type standard: boolean 169 170 >>> is_aa('ALA') 171 True 172 173 Known three letter codes for modified amino acids are supported, 174 175 >>> is_aa('FME') 176 True 177 >>> is_aa('FME', standard=True) 178 False 179 """ 180 # TODO - What about special cases like XXX, can they appear in PDB files? 181 if not isinstance(residue, basestring): 182 residue=residue.get_resname() 183 residue=residue.upper() 184 if standard: 185 return residue in d3_to_index 186 else: 187 return residue in SCOPData.protein_letters_3to1
188 189
190 -class Polypeptide(list):
191 """A polypeptide is simply a list of L{Residue} objects."""
192 - def get_ca_list(self):
193 """Get list of C-alpha atoms in the polypeptide. 194 195 @return: the list of C-alpha atoms 196 @rtype: [L{Atom}, L{Atom}, ...] 197 """ 198 ca_list=[] 199 for res in self: 200 ca=res["CA"] 201 ca_list.append(ca) 202 return ca_list
203
204 - def get_phi_psi_list(self):
205 """Return the list of phi/psi dihedral angles.""" 206 ppl=[] 207 lng=len(self) 208 for i in range(0, lng): 209 res=self[i] 210 try: 211 n=res['N'].get_vector() 212 ca=res['CA'].get_vector() 213 c=res['C'].get_vector() 214 except: 215 # Some atoms are missing 216 # Phi/Psi cannot be calculated for this residue 217 ppl.append((None, None)) 218 res.xtra["PHI"]=None 219 res.xtra["PSI"]=None 220 continue 221 # Phi 222 if i>0: 223 rp=self[i-1] 224 try: 225 cp=rp['C'].get_vector() 226 phi=calc_dihedral(cp, n, ca, c) 227 except: 228 phi=None 229 else: 230 # No phi for residue 0! 231 phi=None 232 # Psi 233 if i<(lng-1): 234 rn=self[i+1] 235 try: 236 nn=rn['N'].get_vector() 237 psi=calc_dihedral(n, ca, c, nn) 238 except: 239 psi=None 240 else: 241 # No psi for last residue! 242 psi=None 243 ppl.append((phi, psi)) 244 # Add Phi/Psi to xtra dict of residue 245 res.xtra["PHI"]=phi 246 res.xtra["PSI"]=psi 247 return ppl
248
249 - def get_tau_list(self):
250 """List of tau torsions angles for all 4 consecutive Calpha atoms.""" 251 ca_list=self.get_ca_list() 252 tau_list=[] 253 for i in range(0, len(ca_list)-3): 254 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2], ca_list[i+3]) 255 v1, v2, v3, v4 = [a.get_vector() for a in atom_list] 256 tau=calc_dihedral(v1, v2, v3, v4) 257 tau_list.append(tau) 258 # Put tau in xtra dict of residue 259 res=ca_list[i+2].get_parent() 260 res.xtra["TAU"]=tau 261 return tau_list
262
263 - def get_theta_list(self):
264 """List of theta angles for all 3 consecutive Calpha atoms.""" 265 theta_list=[] 266 ca_list=self.get_ca_list() 267 for i in range(0, len(ca_list)-2): 268 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2]) 269 v1, v2, v3 = [a.get_vector() for a in atom_list] 270 theta=calc_angle(v1, v2, v3) 271 theta_list.append(theta) 272 # Put tau in xtra dict of residue 273 res=ca_list[i+1].get_parent() 274 res.xtra["THETA"]=theta 275 return theta_list
276
277 - def get_sequence(self):
278 """Return the AA sequence as a Seq object. 279 280 @return: polypeptide sequence 281 @rtype: L{Seq} 282 """ 283 s="" 284 for res in self: 285 s += SCOPData.protein_letters_3to1.get(res.get_resname(), 'X') 286 seq=Seq(s, generic_protein) 287 return seq
288
289 - def __repr__(self):
290 """Return string representation of the polypeptide. 291 292 Return <Polypeptide start=START end=END>, where START 293 and END are sequence identifiers of the outer residues. 294 """ 295 start=self[0].get_id()[1] 296 end=self[-1].get_id()[1] 297 s="<Polypeptide start=%s end=%s>" % (start, end) 298 return s
299 300
301 -class _PPBuilder:
302 """Base class to extract polypeptides. 303 304 It checks if two consecutive residues in a chain are connected. 305 The connectivity test is implemented by a subclass. 306 307 This assumes you want both standard and non-standard amino acids. 308 """
309 - def __init__(self, radius):
310 """ 311 @param radius: distance 312 @type radius: float 313 """ 314 self.radius=radius
315
316 - def _accept(self, residue, standard_aa_only):
317 """Check if the residue is an amino acid (PRIVATE).""" 318 if is_aa(residue, standard=standard_aa_only): 319 return True 320 elif not standard_aa_only and "CA" in residue.child_dict: 321 # It has an alpha carbon... 322 # We probably need to update the hard coded list of 323 # non-standard residues, see function is_aa for details. 324 warnings.warn("Assuming residue %s is an unknown modified " 325 "amino acid" % residue.get_resname()) 326 return True 327 else: 328 # not a standard AA so skip 329 return False
330
331 - def build_peptides(self, entity, aa_only=1):
332 """Build and return a list of Polypeptide objects. 333 334 @param entity: polypeptides are searched for in this object 335 @type entity: L{Structure}, L{Model} or L{Chain} 336 337 @param aa_only: if 1, the residue needs to be a standard AA 338 @type aa_only: int 339 """ 340 is_connected=self._is_connected 341 accept=self._accept 342 level=entity.get_level() 343 # Decide which entity we are dealing with 344 if level=="S": 345 model=entity[0] 346 chain_list=model.get_list() 347 elif level=="M": 348 chain_list=entity.get_list() 349 elif level=="C": 350 chain_list=[entity] 351 else: 352 raise PDBException("Entity should be Structure, Model or Chain.") 353 pp_list=[] 354 for chain in chain_list: 355 chain_it=iter(chain) 356 try: 357 prev_res = next(chain_it) 358 while not accept(prev_res, aa_only): 359 prev_res = next(chain_it) 360 except StopIteration: 361 # No interesting residues at all in this chain 362 continue 363 pp=None 364 for next_res in chain_it: 365 if accept(prev_res, aa_only) \ 366 and accept(next_res, aa_only) \ 367 and is_connected(prev_res, next_res): 368 if pp is None: 369 pp=Polypeptide() 370 pp.append(prev_res) 371 pp_list.append(pp) 372 pp.append(next_res) 373 else: 374 # Either too far apart, or one of the residues is unwanted. 375 # End the current peptide 376 pp=None 377 prev_res=next_res 378 return pp_list
379 380
381 -class CaPPBuilder(_PPBuilder):
382 """Use CA--CA distance to find polypeptides."""
383 - def __init__(self, radius=4.3):
384 _PPBuilder.__init__(self, radius)
385
386 - def _is_connected(self, prev_res, next_res):
387 for r in [prev_res, next_res]: 388 if not r.has_id("CA"): 389 return False 390 n=next_res["CA"] 391 p=prev_res["CA"] 392 # Unpack disordered 393 if n.is_disordered(): 394 nlist=n.disordered_get_list() 395 else: 396 nlist=[n] 397 if p.is_disordered(): 398 plist=p.disordered_get_list() 399 else: 400 plist=[p] 401 for nn in nlist: 402 for pp in plist: 403 if (nn-pp)<self.radius: 404 return True 405 return False
406 407
408 -class PPBuilder(_PPBuilder):
409 """Use C--N distance to find polypeptides."""
410 - def __init__(self, radius=1.8):
411 _PPBuilder.__init__(self, radius)
412
413 - def _is_connected(self, prev_res, next_res):
414 if not prev_res.has_id("C"): 415 return False 416 if not next_res.has_id("N"): 417 return False 418 test_dist=self._test_dist 419 c=prev_res["C"] 420 n=next_res["N"] 421 # Test all disordered atom positions! 422 if c.is_disordered(): 423 clist=c.disordered_get_list() 424 else: 425 clist=[c] 426 if n.is_disordered(): 427 nlist=n.disordered_get_list() 428 else: 429 nlist=[n] 430 for nn in nlist: 431 for cc in clist: 432 # To form a peptide bond, N and C must be 433 # within radius and have the same altloc 434 # identifier or one altloc blank 435 n_altloc=nn.get_altloc() 436 c_altloc=cc.get_altloc() 437 if n_altloc==c_altloc or n_altloc==" " or c_altloc==" ": 438 if test_dist(nn, cc): 439 # Select the disordered atoms that 440 # are indeed bonded 441 if c.is_disordered(): 442 c.disordered_select(c_altloc) 443 if n.is_disordered(): 444 n.disordered_select(n_altloc) 445 return True 446 return False
447
448 - def _test_dist(self, c, n):
449 """Return 1 if distance between atoms<radius (PRIVATE).""" 450 if (c-n)<self.radius: 451 return 1 452 else: 453 return 0
454 455 456 if __name__=="__main__": 457 import sys 458 from Bio.PDB.PDBParser import PDBParser 459 460 p=PDBParser(PERMISSIVE=True) 461 462 s=p.get_structure("scr", sys.argv[1]) 463 464 ppb=PPBuilder() 465 466 print("C-N") 467 for pp in ppb.build_peptides(s): 468 print(pp.get_sequence()) 469 for pp in ppb.build_peptides(s[0]): 470 print(pp.get_sequence()) 471 for pp in ppb.build_peptides(s[0]["A"]): 472 print(pp.get_sequence()) 473 474 for pp in ppb.build_peptides(s): 475 for phi, psi in pp.get_phi_psi_list(): 476 print("%f %f" % (phi, psi)) 477 478 ppb=CaPPBuilder() 479 480 print("CA-CA") 481 for pp in ppb.build_peptides(s): 482 print(pp.get_sequence()) 483 for pp in ppb.build_peptides(s[0]): 484 print(pp.get_sequence()) 485 for pp in ppb.build_peptides(s[0]["A"]): 486 print(pp.get_sequence()) 487