Package Bio :: Package PDB :: Module Polypeptide
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.Polypeptide

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Polypeptide-related classes (construction and representation). 
  7   
  8  Simple example with multiple chains, 
  9   
 10      >>> from Bio.PDB.PDBParser import PDBParser 
 11      >>> from Bio.PDB.Polypeptide import PPBuilder 
 12      >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') 
 13      >>> ppb=PPBuilder() 
 14      >>> for pp in ppb.build_peptides(structure): 
 15      ...     print(pp.get_sequence()) 
 16      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 17      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 18      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 19      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 20      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 21   
 22  Example with non-standard amino acids using HETATM lines in the PDB file, 
 23  in this case selenomethionine (MSE): 
 24   
 25      >>> from Bio.PDB.PDBParser import PDBParser 
 26      >>> from Bio.PDB.Polypeptide import PPBuilder 
 27      >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') 
 28      >>> ppb=PPBuilder() 
 29      >>> for pp in ppb.build_peptides(structure): 
 30      ...     print(pp.get_sequence()) 
 31      DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW 
 32      TETLLVQNANPDCKTILKALGPGATLEE 
 33      TACQG 
 34   
 35  If you want to, you can include non-standard amino acids in the peptides: 
 36   
 37      >>> for pp in ppb.build_peptides(structure, aa_only=False): 
 38      ...     print(pp.get_sequence()) 
 39      ...     print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname())) 
 40      ...     print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname())) 
 41      ...     print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname())) 
 42      MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG 
 43      M MSE 
 44      M MSE 
 45      M MSE 
 46   
 47  In this case the selenomethionines (the first and also seventh and sixth from 
 48  last residues) have been shown as M (methionine) by the get_sequence method. 
 49  """ 
 50   
 51  from __future__ import print_function 
 52  from Bio._py3k import basestring 
 53   
 54  import warnings 
 55   
 56  from Bio.Alphabet import generic_protein 
 57  from Bio.Data import SCOPData 
 58  from Bio.Seq import Seq 
 59  from Bio.PDB.PDBExceptions import PDBException 
 60  from Bio.PDB.Vector import calc_dihedral, calc_angle 
 61   
 62   
 63  standard_aa_names = ["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", 
 64                       "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", 
 65                       "TRP", "TYR"] 
 66   
 67   
 68  aa1 = "ACDEFGHIKLMNPQRSTVWY" 
 69  aa3 = standard_aa_names 
 70   
 71  d1_to_index = {} 
 72  dindex_to_1 = {} 
 73  d3_to_index = {} 
 74  dindex_to_3 = {} 
 75   
 76  # Create some lookup tables 
 77  for i in range(0, 20): 
 78      n1 = aa1[i] 
 79      n3 = aa3[i] 
 80      d1_to_index[n1] = i 
 81      dindex_to_1[i] = n1 
 82      d3_to_index[n3] = i 
 83      dindex_to_3[i] = n3 
 84   
 85   
86 -def index_to_one(index):
87 """Index to corresponding one letter amino acid name. 88 89 >>> index_to_one(0) 90 'A' 91 >>> index_to_one(19) 92 'Y' 93 """ 94 return dindex_to_1[index]
95 96
97 -def one_to_index(s):
98 """One letter code to index. 99 100 >>> one_to_index('A') 101 0 102 >>> one_to_index('Y') 103 19 104 """ 105 return d1_to_index[s]
106 107
108 -def index_to_three(i):
109 """Index to corresponding three letter amino acid name. 110 111 >>> index_to_three(0) 112 'ALA' 113 >>> index_to_three(19) 114 'TYR' 115 """ 116 return dindex_to_3[i]
117 118
119 -def three_to_index(s):
120 """Three letter code to index. 121 122 >>> three_to_index('ALA') 123 0 124 >>> three_to_index('TYR') 125 19 126 """ 127 return d3_to_index[s]
128 129
130 -def three_to_one(s):
131 """Three letter code to one letter code. 132 133 >>> three_to_one('ALA') 134 'A' 135 >>> three_to_one('TYR') 136 'Y' 137 138 For non-standard amino acids, you get a KeyError: 139 140 >>> three_to_one('MSE') 141 Traceback (most recent call last): 142 ... 143 KeyError: 'MSE' 144 """ 145 i = d3_to_index[s] 146 return dindex_to_1[i]
147 148
149 -def one_to_three(s):
150 """One letter code to three letter code. 151 152 >>> one_to_three('A') 153 'ALA' 154 >>> one_to_three('Y') 155 'TYR' 156 """ 157 i = d1_to_index[s] 158 return dindex_to_3[i]
159 160
161 -def is_aa(residue, standard=False):
162 """Return True if residue object/string is an amino acid. 163 164 @param residue: a L{Residue} object OR a three letter amino acid code 165 @type residue: L{Residue} or string 166 167 @param standard: flag to check for the 20 AA (default false) 168 @type standard: boolean 169 170 >>> is_aa('ALA') 171 True 172 173 Known three letter codes for modified amino acids are supported, 174 175 >>> is_aa('FME') 176 True 177 >>> is_aa('FME', standard=True) 178 False 179 """ 180 # TODO - What about special cases like XXX, can they appear in PDB files? 181 if not isinstance(residue, basestring): 182 residue = residue.get_resname() 183 residue = residue.upper() 184 if standard: 185 return residue in d3_to_index 186 else: 187 return residue in SCOPData.protein_letters_3to1
188 189
190 -class Polypeptide(list):
191 """A polypeptide is simply a list of L{Residue} objects."""
192 - def get_ca_list(self):
193 """Get list of C-alpha atoms in the polypeptide. 194 195 @return: the list of C-alpha atoms 196 @rtype: [L{Atom}, L{Atom}, ...] 197 """ 198 ca_list = [] 199 for res in self: 200 ca = res["CA"] 201 ca_list.append(ca) 202 return ca_list
203
204 - def get_phi_psi_list(self):
205 """Return the list of phi/psi dihedral angles.""" 206 ppl = [] 207 lng = len(self) 208 for i in range(0, lng): 209 res = self[i] 210 try: 211 n = res['N'].get_vector() 212 ca = res['CA'].get_vector() 213 c = res['C'].get_vector() 214 except Exception: 215 # Some atoms are missing 216 # Phi/Psi cannot be calculated for this residue 217 ppl.append((None, None)) 218 res.xtra["PHI"] = None 219 res.xtra["PSI"] = None 220 continue 221 # Phi 222 if i > 0: 223 rp = self[i - 1] 224 try: 225 cp = rp['C'].get_vector() 226 phi = calc_dihedral(cp, n, ca, c) 227 except Exception: 228 phi = None 229 else: 230 # No phi for residue 0! 231 phi = None 232 # Psi 233 if i < (lng - 1): 234 rn = self[i + 1] 235 try: 236 nn = rn['N'].get_vector() 237 psi = calc_dihedral(n, ca, c, nn) 238 except Exception: 239 psi = None 240 else: 241 # No psi for last residue! 242 psi = None 243 ppl.append((phi, psi)) 244 # Add Phi/Psi to xtra dict of residue 245 res.xtra["PHI"] = phi 246 res.xtra["PSI"] = psi 247 return ppl
248
249 - def get_tau_list(self):
250 """List of tau torsions angles for all 4 consecutive Calpha atoms.""" 251 ca_list = self.get_ca_list() 252 tau_list = [] 253 for i in range(0, len(ca_list) - 3): 254 atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2], ca_list[i + 3]) 255 v1, v2, v3, v4 = [a.get_vector() for a in atom_list] 256 tau = calc_dihedral(v1, v2, v3, v4) 257 tau_list.append(tau) 258 # Put tau in xtra dict of residue 259 res = ca_list[i + 2].get_parent() 260 res.xtra["TAU"] = tau 261 return tau_list
262
263 - def get_theta_list(self):
264 """List of theta angles for all 3 consecutive Calpha atoms.""" 265 theta_list = [] 266 ca_list = self.get_ca_list() 267 for i in range(0, len(ca_list) - 2): 268 atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2]) 269 v1, v2, v3 = [a.get_vector() for a in atom_list] 270 theta = calc_angle(v1, v2, v3) 271 theta_list.append(theta) 272 # Put tau in xtra dict of residue 273 res = ca_list[i + 1].get_parent() 274 res.xtra["THETA"] = theta 275 return theta_list
276
277 - def get_sequence(self):
278 """Return the AA sequence as a Seq object. 279 280 @return: polypeptide sequence 281 @rtype: L{Seq} 282 """ 283 s = "" 284 for res in self: 285 s += SCOPData.protein_letters_3to1.get(res.get_resname(), 'X') 286 seq = Seq(s, generic_protein) 287 return seq
288
289 - def __repr__(self):
290 """Return string representation of the polypeptide. 291 292 Return <Polypeptide start=START end=END>, where START 293 and END are sequence identifiers of the outer residues. 294 """ 295 start = self[0].get_id()[1] 296 end = self[-1].get_id()[1] 297 s = "<Polypeptide start=%s end=%s>" % (start, end) 298 return s
299 300
301 -class _PPBuilder(object):
302 """Base class to extract polypeptides. 303 304 It checks if two consecutive residues in a chain are connected. 305 The connectivity test is implemented by a subclass. 306 307 This assumes you want both standard and non-standard amino acids. 308 """
309 - def __init__(self, radius):
310 """ 311 @param radius: distance 312 @type radius: float 313 """ 314 self.radius = radius
315
316 - def _accept(self, residue, standard_aa_only):
317 """Check if the residue is an amino acid (PRIVATE).""" 318 if is_aa(residue, standard=standard_aa_only): 319 return True 320 elif not standard_aa_only and "CA" in residue.child_dict: 321 # It has an alpha carbon... 322 # We probably need to update the hard coded list of 323 # non-standard residues, see function is_aa for details. 324 warnings.warn("Assuming residue %s is an unknown modified " 325 "amino acid" % residue.get_resname()) 326 return True 327 else: 328 # not a standard AA so skip 329 return False
330
331 - def build_peptides(self, entity, aa_only=1):
332 """Build and return a list of Polypeptide objects. 333 334 @param entity: polypeptides are searched for in this object 335 @type entity: L{Structure}, L{Model} or L{Chain} 336 337 @param aa_only: if 1, the residue needs to be a standard AA 338 @type aa_only: int 339 """ 340 is_connected = self._is_connected 341 accept = self._accept 342 level = entity.get_level() 343 # Decide which entity we are dealing with 344 if level == "S": 345 model = entity[0] 346 chain_list = model.get_list() 347 elif level == "M": 348 chain_list = entity.get_list() 349 elif level == "C": 350 chain_list = [entity] 351 else: 352 raise PDBException("Entity should be Structure, Model or Chain.") 353 pp_list = [] 354 for chain in chain_list: 355 chain_it = iter(chain) 356 try: 357 prev_res = next(chain_it) 358 while not accept(prev_res, aa_only): 359 prev_res = next(chain_it) 360 except StopIteration: 361 # No interesting residues at all in this chain 362 continue 363 pp = None 364 for next_res in chain_it: 365 if accept(prev_res, aa_only) \ 366 and accept(next_res, aa_only) \ 367 and is_connected(prev_res, next_res): 368 if pp is None: 369 pp = Polypeptide() 370 pp.append(prev_res) 371 pp_list.append(pp) 372 pp.append(next_res) 373 else: 374 # Either too far apart, or one of the residues is unwanted. 375 # End the current peptide 376 pp = None 377 prev_res = next_res 378 return pp_list
379 380
381 -class CaPPBuilder(_PPBuilder):
382 """Use CA--CA distance to find polypeptides."""
383 - def __init__(self, radius=4.3):
384 _PPBuilder.__init__(self, radius)
385
386 - def _is_connected(self, prev_res, next_res):
387 for r in [prev_res, next_res]: 388 if not r.has_id("CA"): 389 return False 390 n = next_res["CA"] 391 p = prev_res["CA"] 392 # Unpack disordered 393 if n.is_disordered(): 394 nlist = n.disordered_get_list() 395 else: 396 nlist = [n] 397 if p.is_disordered(): 398 plist = p.disordered_get_list() 399 else: 400 plist = [p] 401 for nn in nlist: 402 for pp in plist: 403 if (nn - pp) < self.radius: 404 return True 405 return False
406 407
408 -class PPBuilder(_PPBuilder):
409 """Use C--N distance to find polypeptides."""
410 - def __init__(self, radius=1.8):
411 _PPBuilder.__init__(self, radius)
412
413 - def _is_connected(self, prev_res, next_res):
414 if not prev_res.has_id("C"): 415 return False 416 if not next_res.has_id("N"): 417 return False 418 test_dist = self._test_dist 419 c = prev_res["C"] 420 n = next_res["N"] 421 # Test all disordered atom positions! 422 if c.is_disordered(): 423 clist = c.disordered_get_list() 424 else: 425 clist = [c] 426 if n.is_disordered(): 427 nlist = n.disordered_get_list() 428 else: 429 nlist = [n] 430 for nn in nlist: 431 for cc in clist: 432 # To form a peptide bond, N and C must be 433 # within radius and have the same altloc 434 # identifier or one altloc blank 435 n_altloc = nn.get_altloc() 436 c_altloc = cc.get_altloc() 437 if n_altloc == c_altloc or n_altloc == " " or c_altloc == " ": 438 if test_dist(nn, cc): 439 # Select the disordered atoms that 440 # are indeed bonded 441 if c.is_disordered(): 442 c.disordered_select(c_altloc) 443 if n.is_disordered(): 444 n.disordered_select(n_altloc) 445 return True 446 return False
447
448 - def _test_dist(self, c, n):
449 """Return 1 if distance between atoms<radius (PRIVATE).""" 450 if (c - n) < self.radius: 451 return 1 452 else: 453 return 0
454 455 456 if __name__ == "__main__": 457 import sys 458 from Bio.PDB.PDBParser import PDBParser 459 460 p = PDBParser(PERMISSIVE=True) 461 462 s = p.get_structure("scr", sys.argv[1]) 463 464 ppb = PPBuilder() 465 466 print("C-N") 467 for pp in ppb.build_peptides(s): 468 print(pp.get_sequence()) 469 for pp in ppb.build_peptides(s[0]): 470 print(pp.get_sequence()) 471 for pp in ppb.build_peptides(s[0]["A"]): 472 print(pp.get_sequence()) 473 474 for pp in ppb.build_peptides(s): 475 for phi, psi in pp.get_phi_psi_list(): 476 print("%f %f" % (phi, psi)) 477 478 ppb = CaPPBuilder() 479 480 print("CA-CA") 481 for pp in ppb.build_peptides(s): 482 print(pp.get_sequence()) 483 for pp in ppb.build_peptides(s[0]): 484 print(pp.get_sequence()) 485 for pp in ppb.build_peptides(s[0]["A"]): 486 print(pp.get_sequence()) 487