Package Bio :: Package PDB :: Module Polypeptide
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.Polypeptide

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Polypeptide-related classes (construction and representation). 
  7   
  8  Simple example with multiple chains, 
  9   
 10      >>> from Bio.PDB.PDBParser import PDBParser 
 11      >>> from Bio.PDB.Polypeptide import PPBuilder 
 12      >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') 
 13      >>> ppb=PPBuilder() 
 14      >>> for pp in ppb.build_peptides(structure): 
 15      ...     print pp.get_sequence() 
 16      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 17      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 18      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 19      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 20      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 21   
 22  Example with non-standard amino acids using HETATM lines in the PDB file, 
 23  in this case selenomethionine (MSE): 
 24   
 25      >>> from Bio.PDB.PDBParser import PDBParser 
 26      >>> from Bio.PDB.Polypeptide import PPBuilder 
 27      >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') 
 28      >>> ppb=PPBuilder() 
 29      >>> for pp in ppb.build_peptides(structure): 
 30      ...     print pp.get_sequence() 
 31      DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW 
 32      TETLLVQNANPDCKTILKALGPGATLEE 
 33      TACQG 
 34   
 35  If you want to, you can include non-standard amino acids in the peptides: 
 36   
 37      >>> for pp in ppb.build_peptides(structure, aa_only=False): 
 38      ...     print pp.get_sequence() 
 39      ...     print pp.get_sequence()[0], pp[0].get_resname() 
 40      ...     print pp.get_sequence()[-7], pp[-7].get_resname() 
 41      ...     print pp.get_sequence()[-6], pp[-6].get_resname() 
 42      MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG 
 43      M MSE 
 44      M MSE 
 45      M MSE 
 46   
 47  In this case the selenomethionines (the first and also seventh and sixth from 
 48  last residues) have been shown as M (methionine) by the get_sequence method. 
 49  """ 
 50   
 51  import warnings 
 52   
 53  from Bio.Alphabet import generic_protein 
 54  from Bio.Seq import Seq 
 55  from Bio.SCOP.Raf import to_one_letter_code 
 56  from Bio.PDB.PDBExceptions import PDBException 
 57  from Bio.PDB.Vector import calc_dihedral, calc_angle 
 58   
 59   
 60  standard_aa_names=["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", 
 61                     "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", 
 62                     "TRP", "TYR"] 
 63   
 64   
 65  aa1="ACDEFGHIKLMNPQRSTVWY" 
 66  aa3=standard_aa_names 
 67   
 68  d1_to_index={} 
 69  dindex_to_1={} 
 70  d3_to_index={} 
 71  dindex_to_3={} 
 72   
 73  # Create some lookup tables 
 74  for i in range(0, 20): 
 75      n1=aa1[i] 
 76      n3=aa3[i] 
 77      d1_to_index[n1]=i 
 78      dindex_to_1[i]=n1 
 79      d3_to_index[n3]=i 
 80      dindex_to_3[i]=n3 
 81   
 82   
83 -def index_to_one(index):
84 """Index to corresponding one letter amino acid name. 85 86 >>> index_to_one(0) 87 'A' 88 >>> index_to_one(19) 89 'Y' 90 """ 91 return dindex_to_1[index]
92 93
94 -def one_to_index(s):
95 """One letter code to index. 96 97 >>> one_to_index('A') 98 0 99 >>> one_to_index('Y') 100 19 101 """ 102 return d1_to_index[s]
103 104
105 -def index_to_three(i):
106 """Index to corresponding three letter amino acid name. 107 108 >>> index_to_three(0) 109 'ALA' 110 >>> index_to_three(19) 111 'TYR' 112 """ 113 return dindex_to_3[i]
114 115
116 -def three_to_index(s):
117 """Three letter code to index. 118 119 >>> three_to_index('ALA') 120 0 121 >>> three_to_index('TYR') 122 19 123 """ 124 return d3_to_index[s]
125 126
127 -def three_to_one(s):
128 """Three letter code to one letter code. 129 130 >>> three_to_one('ALA') 131 'A' 132 >>> three_to_one('TYR') 133 'Y' 134 135 For non-standard amino acids, you get a KeyError: 136 137 >>> three_to_one('MSE') 138 Traceback (most recent call last): 139 ... 140 KeyError: 'MSE' 141 """ 142 i=d3_to_index[s] 143 return dindex_to_1[i]
144 145
146 -def one_to_three(s):
147 """One letter code to three letter code. 148 149 >>> one_to_three('A') 150 'ALA' 151 >>> one_to_three('Y') 152 'TYR' 153 """ 154 i=d1_to_index[s] 155 return dindex_to_3[i]
156 157
158 -def is_aa(residue, standard=False):
159 """Return True if residue object/string is an amino acid. 160 161 @param residue: a L{Residue} object OR a three letter amino acid code 162 @type residue: L{Residue} or string 163 164 @param standard: flag to check for the 20 AA (default false) 165 @type standard: boolean 166 167 >>> is_aa('ALA') 168 True 169 170 Known three letter codes for modified amino acids are supported, 171 172 >>> is_aa('FME') 173 True 174 >>> is_aa('FME', standard=True) 175 False 176 """ 177 #TODO - What about special cases like XXX, can they appear in PDB files? 178 if not isinstance(residue, basestring): 179 residue=residue.get_resname() 180 residue=residue.upper() 181 if standard: 182 return residue in d3_to_index 183 else: 184 return residue in to_one_letter_code
185 186
187 -class Polypeptide(list):
188 """A polypeptide is simply a list of L{Residue} objects."""
189 - def get_ca_list(self):
190 """Get list of C-alpha atoms in the polypeptide. 191 192 @return: the list of C-alpha atoms 193 @rtype: [L{Atom}, L{Atom}, ...] 194 """ 195 ca_list=[] 196 for res in self: 197 ca=res["CA"] 198 ca_list.append(ca) 199 return ca_list
200
201 - def get_phi_psi_list(self):
202 """Return the list of phi/psi dihedral angles.""" 203 ppl=[] 204 lng=len(self) 205 for i in range(0, lng): 206 res=self[i] 207 try: 208 n=res['N'].get_vector() 209 ca=res['CA'].get_vector() 210 c=res['C'].get_vector() 211 except: 212 # Some atoms are missing 213 # Phi/Psi cannot be calculated for this residue 214 ppl.append((None, None)) 215 res.xtra["PHI"]=None 216 res.xtra["PSI"]=None 217 continue 218 # Phi 219 if i>0: 220 rp=self[i-1] 221 try: 222 cp=rp['C'].get_vector() 223 phi=calc_dihedral(cp, n, ca, c) 224 except: 225 phi=None 226 else: 227 # No phi for residue 0! 228 phi=None 229 # Psi 230 if i<(lng-1): 231 rn=self[i+1] 232 try: 233 nn=rn['N'].get_vector() 234 psi=calc_dihedral(n, ca, c, nn) 235 except: 236 psi=None 237 else: 238 # No psi for last residue! 239 psi=None 240 ppl.append((phi, psi)) 241 # Add Phi/Psi to xtra dict of residue 242 res.xtra["PHI"]=phi 243 res.xtra["PSI"]=psi 244 return ppl
245
246 - def get_tau_list(self):
247 """List of tau torsions angles for all 4 consecutive Calpha atoms.""" 248 ca_list=self.get_ca_list() 249 tau_list=[] 250 for i in range(0, len(ca_list)-3): 251 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2], ca_list[i+3]) 252 v1, v2, v3, v4 = [a.get_vector() for a in atom_list] 253 tau=calc_dihedral(v1, v2, v3, v4) 254 tau_list.append(tau) 255 # Put tau in xtra dict of residue 256 res=ca_list[i+2].get_parent() 257 res.xtra["TAU"]=tau 258 return tau_list
259
260 - def get_theta_list(self):
261 """List of theta angles for all 3 consecutive Calpha atoms.""" 262 theta_list=[] 263 ca_list=self.get_ca_list() 264 for i in range(0, len(ca_list)-2): 265 atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2]) 266 v1, v2, v3 = [a.get_vector() for a in atom_list] 267 theta=calc_angle(v1, v2, v3) 268 theta_list.append(theta) 269 # Put tau in xtra dict of residue 270 res=ca_list[i+1].get_parent() 271 res.xtra["THETA"]=theta 272 return theta_list
273
274 - def get_sequence(self):
275 """Return the AA sequence as a Seq object. 276 277 @return: polypeptide sequence 278 @rtype: L{Seq} 279 """ 280 s="" 281 for res in self: 282 s += to_one_letter_code.get(res.get_resname(), 'X') 283 seq=Seq(s, generic_protein) 284 return seq
285
286 - def __repr__(self):
287 """Return string representation of the polypeptide. 288 289 Return <Polypeptide start=START end=END>, where START 290 and END are sequence identifiers of the outer residues. 291 """ 292 start=self[0].get_id()[1] 293 end=self[-1].get_id()[1] 294 s="<Polypeptide start=%s end=%s>" % (start, end) 295 return s
296 297
298 -class _PPBuilder:
299 """Base class to extract polypeptides. 300 301 It checks if two consecutive residues in a chain are connected. 302 The connectivity test is implemented by a subclass. 303 304 This assumes you want both standard and non-standard amino acids. 305 """
306 - def __init__(self, radius):
307 """ 308 @param radius: distance 309 @type radius: float 310 """ 311 self.radius=radius
312
313 - def _accept(self, residue, standard_aa_only):
314 """Check if the residue is an amino acid (PRIVATE).""" 315 if is_aa(residue, standard=standard_aa_only): 316 return True 317 elif not standard_aa_only and "CA" in residue.child_dict: 318 #It has an alpha carbon... 319 #We probably need to update the hard coded list of 320 #non-standard residues, see function is_aa for details. 321 warnings.warn("Assuming residue %s is an unknown modified " 322 "amino acid" % residue.get_resname()) 323 return True 324 else: 325 # not a standard AA so skip 326 return False
327
328 - def build_peptides(self, entity, aa_only=1):
329 """Build and return a list of Polypeptide objects. 330 331 @param entity: polypeptides are searched for in this object 332 @type entity: L{Structure}, L{Model} or L{Chain} 333 334 @param aa_only: if 1, the residue needs to be a standard AA 335 @type aa_only: int 336 """ 337 is_connected=self._is_connected 338 accept=self._accept 339 level=entity.get_level() 340 # Decide which entity we are dealing with 341 if level=="S": 342 model=entity[0] 343 chain_list=model.get_list() 344 elif level=="M": 345 chain_list=entity.get_list() 346 elif level=="C": 347 chain_list=[entity] 348 else: 349 raise PDBException("Entity should be Structure, Model or Chain.") 350 pp_list=[] 351 for chain in chain_list: 352 chain_it=iter(chain) 353 try: 354 prev_res = chain_it.next() 355 while not accept(prev_res, aa_only): 356 prev_res = chain_it.next() 357 except StopIteration: 358 #No interesting residues at all in this chain 359 continue 360 pp=None 361 for next_res in chain_it: 362 if accept(prev_res, aa_only) \ 363 and accept(next_res, aa_only) \ 364 and is_connected(prev_res, next_res): 365 if pp is None: 366 pp=Polypeptide() 367 pp.append(prev_res) 368 pp_list.append(pp) 369 pp.append(next_res) 370 else: 371 #Either too far apart, or one of the residues is unwanted. 372 #End the current peptide 373 pp=None 374 prev_res=next_res 375 return pp_list
376 377
378 -class CaPPBuilder(_PPBuilder):
379 """Use CA--CA distance to find polypeptides."""
380 - def __init__(self, radius=4.3):
381 _PPBuilder.__init__(self, radius)
382
383 - def _is_connected(self, prev_res, next_res):
384 for r in [prev_res, next_res]: 385 if not r.has_id("CA"): 386 return False 387 n=next_res["CA"] 388 p=prev_res["CA"] 389 # Unpack disordered 390 if n.is_disordered(): 391 nlist=n.disordered_get_list() 392 else: 393 nlist=[n] 394 if p.is_disordered(): 395 plist=p.disordered_get_list() 396 else: 397 plist=[p] 398 for nn in nlist: 399 for pp in plist: 400 if (nn-pp)<self.radius: 401 return True 402 return False
403 404
405 -class PPBuilder(_PPBuilder):
406 """Use C--N distance to find polypeptides."""
407 - def __init__(self, radius=1.8):
408 _PPBuilder.__init__(self, radius)
409
410 - def _is_connected(self, prev_res, next_res):
411 if not prev_res.has_id("C"): 412 return False 413 if not next_res.has_id("N"): 414 return False 415 test_dist=self._test_dist 416 c=prev_res["C"] 417 n=next_res["N"] 418 # Test all disordered atom positions! 419 if c.is_disordered(): 420 clist=c.disordered_get_list() 421 else: 422 clist=[c] 423 if n.is_disordered(): 424 nlist=n.disordered_get_list() 425 else: 426 nlist=[n] 427 for nn in nlist: 428 for cc in clist: 429 # To form a peptide bond, N and C must be 430 # within radius and have the same altloc 431 # identifier or one altloc blank 432 n_altloc=nn.get_altloc() 433 c_altloc=cc.get_altloc() 434 if n_altloc==c_altloc or n_altloc==" " or c_altloc==" ": 435 if test_dist(nn, cc): 436 # Select the disordered atoms that 437 # are indeed bonded 438 if c.is_disordered(): 439 c.disordered_select(c_altloc) 440 if n.is_disordered(): 441 n.disordered_select(n_altloc) 442 return True 443 return False
444
445 - def _test_dist(self, c, n):
446 """Return 1 if distance between atoms<radius (PRIVATE).""" 447 if (c-n)<self.radius: 448 return 1 449 else: 450 return 0
451 452 453 if __name__=="__main__": 454 import sys 455 from Bio.PDB.PDBParser import PDBParser 456 457 p=PDBParser(PERMISSIVE=True) 458 459 s=p.get_structure("scr", sys.argv[1]) 460 461 ppb=PPBuilder() 462 463 print "C-N" 464 for pp in ppb.build_peptides(s): 465 print pp.get_sequence() 466 for pp in ppb.build_peptides(s[0]): 467 print pp.get_sequence() 468 for pp in ppb.build_peptides(s[0]["A"]): 469 print pp.get_sequence() 470 471 for pp in ppb.build_peptides(s): 472 for phi, psi in pp.get_phi_psi_list(): 473 print phi, psi 474 475 ppb=CaPPBuilder() 476 477 print "CA-CA" 478 for pp in ppb.build_peptides(s): 479 print pp.get_sequence() 480 for pp in ppb.build_peptides(s[0]): 481 print pp.get_sequence() 482 for pp in ppb.build_peptides(s[0]["A"]): 483 print pp.get_sequence() 484