Package Bio :: Package PDB :: Module Polypeptide
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.Polypeptide

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Polypeptide-related classes (construction and representation). 
  7   
  8  Simple example with multiple chains, 
  9   
 10      >>> from Bio.PDB.PDBParser import PDBParser 
 11      >>> from Bio.PDB.Polypeptide import PPBuilder 
 12      >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') 
 13      >>> ppb=PPBuilder() 
 14      >>> for pp in ppb.build_peptides(structure): 
 15      ...     print(pp.get_sequence()) 
 16      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 17      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 18      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 19      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 20      LVFFAEDVGSNKGAIIGLMVGGVVIA 
 21   
 22  Example with non-standard amino acids using HETATM lines in the PDB file, 
 23  in this case selenomethionine (MSE): 
 24   
 25      >>> from Bio.PDB.PDBParser import PDBParser 
 26      >>> from Bio.PDB.Polypeptide import PPBuilder 
 27      >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') 
 28      >>> ppb=PPBuilder() 
 29      >>> for pp in ppb.build_peptides(structure): 
 30      ...     print(pp.get_sequence()) 
 31      DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW 
 32      TETLLVQNANPDCKTILKALGPGATLEE 
 33      TACQG 
 34   
 35  If you want to, you can include non-standard amino acids in the peptides: 
 36   
 37      >>> for pp in ppb.build_peptides(structure, aa_only=False): 
 38      ...     print(pp.get_sequence()) 
 39      ...     print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname())) 
 40      ...     print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname())) 
 41      ...     print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname())) 
 42      MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG 
 43      M MSE 
 44      M MSE 
 45      M MSE 
 46   
 47  In this case the selenomethionines (the first and also seventh and sixth from 
 48  last residues) have been shown as M (methionine) by the get_sequence method. 
 49  """ 
 50   
 51  from __future__ import print_function 
 52  from Bio._py3k import basestring 
 53   
 54  import warnings 
 55   
 56  from Bio.Alphabet import generic_protein 
 57  from Bio.Data import SCOPData 
 58  from Bio.Seq import Seq 
 59  from Bio.PDB.PDBExceptions import PDBException 
 60  from Bio.PDB.Vector import calc_dihedral, calc_angle 
 61   
 62   
 63  standard_aa_names = ["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", 
 64                       "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", 
 65                       "TRP", "TYR"] 
 66   
 67   
 68  aa1 = "ACDEFGHIKLMNPQRSTVWY" 
 69  aa3 = standard_aa_names 
 70   
 71  d1_to_index = {} 
 72  dindex_to_1 = {} 
 73  d3_to_index = {} 
 74  dindex_to_3 = {} 
 75   
 76  # Create some lookup tables 
 77  for i in range(0, 20): 
 78      n1 = aa1[i] 
 79      n3 = aa3[i] 
 80      d1_to_index[n1] = i 
 81      dindex_to_1[i] = n1 
 82      d3_to_index[n3] = i 
 83      dindex_to_3[i] = n3 
 84   
 85   
86 -def index_to_one(index):
87 """Index to corresponding one letter amino acid name. 88 89 >>> index_to_one(0) 90 'A' 91 >>> index_to_one(19) 92 'Y' 93 """ 94 return dindex_to_1[index]
95 96
97 -def one_to_index(s):
98 """One letter code to index. 99 100 >>> one_to_index('A') 101 0 102 >>> one_to_index('Y') 103 19 104 """ 105 return d1_to_index[s]
106 107
108 -def index_to_three(i):
109 """Index to corresponding three letter amino acid name. 110 111 >>> index_to_three(0) 112 'ALA' 113 >>> index_to_three(19) 114 'TYR' 115 """ 116 return dindex_to_3[i]
117 118
119 -def three_to_index(s):
120 """Three letter code to index. 121 122 >>> three_to_index('ALA') 123 0 124 >>> three_to_index('TYR') 125 19 126 """ 127 return d3_to_index[s]
128 129
130 -def three_to_one(s):
131 """Three letter code to one letter code. 132 133 >>> three_to_one('ALA') 134 'A' 135 >>> three_to_one('TYR') 136 'Y' 137 138 For non-standard amino acids, you get a KeyError: 139 140 >>> three_to_one('MSE') 141 Traceback (most recent call last): 142 ... 143 KeyError: 'MSE' 144 """ 145 i = d3_to_index[s] 146 return dindex_to_1[i]
147 148
149 -def one_to_three(s):
150 """One letter code to three letter code. 151 152 >>> one_to_three('A') 153 'ALA' 154 >>> one_to_three('Y') 155 'TYR' 156 """ 157 i = d1_to_index[s] 158 return dindex_to_3[i]
159 160
161 -def is_aa(residue, standard=False):
162 """Return True if residue object/string is an amino acid. 163 164 @param residue: a L{Residue} object OR a three letter amino acid code 165 @type residue: L{Residue} or string 166 167 @param standard: flag to check for the 20 AA (default false) 168 @type standard: boolean 169 170 >>> is_aa('ALA') 171 True 172 173 Known three letter codes for modified amino acids are supported, 174 175 >>> is_aa('FME') 176 True 177 >>> is_aa('FME', standard=True) 178 False 179 """ 180 # TODO - What about special cases like XXX, can they appear in PDB files? 181 if not isinstance(residue, basestring): 182 residue = residue.get_resname() 183 residue = residue.upper() 184 if standard: 185 return residue in d3_to_index 186 else: 187 return residue in SCOPData.protein_letters_3to1
188 189
190 -class Polypeptide(list):
191 """A polypeptide is simply a list of L{Residue} objects.""" 192
193 - def get_ca_list(self):
194 """Get list of C-alpha atoms in the polypeptide. 195 196 @return: the list of C-alpha atoms 197 @rtype: [L{Atom}, L{Atom}, ...] 198 """ 199 ca_list = [] 200 for res in self: 201 ca = res["CA"] 202 ca_list.append(ca) 203 return ca_list
204
205 - def get_phi_psi_list(self):
206 """Return the list of phi/psi dihedral angles.""" 207 ppl = [] 208 lng = len(self) 209 for i in range(0, lng): 210 res = self[i] 211 try: 212 n = res['N'].get_vector() 213 ca = res['CA'].get_vector() 214 c = res['C'].get_vector() 215 except Exception: 216 # Some atoms are missing 217 # Phi/Psi cannot be calculated for this residue 218 ppl.append((None, None)) 219 res.xtra["PHI"] = None 220 res.xtra["PSI"] = None 221 continue 222 # Phi 223 if i > 0: 224 rp = self[i - 1] 225 try: 226 cp = rp['C'].get_vector() 227 phi = calc_dihedral(cp, n, ca, c) 228 except Exception: 229 phi = None 230 else: 231 # No phi for residue 0! 232 phi = None 233 # Psi 234 if i < (lng - 1): 235 rn = self[i + 1] 236 try: 237 nn = rn['N'].get_vector() 238 psi = calc_dihedral(n, ca, c, nn) 239 except Exception: 240 psi = None 241 else: 242 # No psi for last residue! 243 psi = None 244 ppl.append((phi, psi)) 245 # Add Phi/Psi to xtra dict of residue 246 res.xtra["PHI"] = phi 247 res.xtra["PSI"] = psi 248 return ppl
249
250 - def get_tau_list(self):
251 """List of tau torsions angles for all 4 consecutive Calpha atoms.""" 252 ca_list = self.get_ca_list() 253 tau_list = [] 254 for i in range(0, len(ca_list) - 3): 255 atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2], ca_list[i + 3]) 256 v1, v2, v3, v4 = [a.get_vector() for a in atom_list] 257 tau = calc_dihedral(v1, v2, v3, v4) 258 tau_list.append(tau) 259 # Put tau in xtra dict of residue 260 res = ca_list[i + 2].get_parent() 261 res.xtra["TAU"] = tau 262 return tau_list
263
264 - def get_theta_list(self):
265 """List of theta angles for all 3 consecutive Calpha atoms.""" 266 theta_list = [] 267 ca_list = self.get_ca_list() 268 for i in range(0, len(ca_list) - 2): 269 atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2]) 270 v1, v2, v3 = [a.get_vector() for a in atom_list] 271 theta = calc_angle(v1, v2, v3) 272 theta_list.append(theta) 273 # Put tau in xtra dict of residue 274 res = ca_list[i + 1].get_parent() 275 res.xtra["THETA"] = theta 276 return theta_list
277
278 - def get_sequence(self):
279 """Return the AA sequence as a Seq object. 280 281 @return: polypeptide sequence 282 @rtype: L{Seq} 283 """ 284 s = "" 285 for res in self: 286 s += SCOPData.protein_letters_3to1.get(res.get_resname(), 'X') 287 seq = Seq(s, generic_protein) 288 return seq
289
290 - def __repr__(self):
291 """Return string representation of the polypeptide. 292 293 Return <Polypeptide start=START end=END>, where START 294 and END are sequence identifiers of the outer residues. 295 """ 296 start = self[0].get_id()[1] 297 end = self[-1].get_id()[1] 298 s = "<Polypeptide start=%s end=%s>" % (start, end) 299 return s
300 301
302 -class _PPBuilder(object):
303 """Base class to extract polypeptides. 304 305 It checks if two consecutive residues in a chain are connected. 306 The connectivity test is implemented by a subclass. 307 308 This assumes you want both standard and non-standard amino acids. 309 """ 310
311 - def __init__(self, radius):
312 """Initialize the base class. 313 314 @param radius: distance 315 @type radius: float 316 """ 317 self.radius = radius
318
319 - def _accept(self, residue, standard_aa_only):
320 """Check if the residue is an amino acid (PRIVATE).""" 321 if is_aa(residue, standard=standard_aa_only): 322 return True 323 elif not standard_aa_only and "CA" in residue.child_dict: 324 # It has an alpha carbon... 325 # We probably need to update the hard coded list of 326 # non-standard residues, see function is_aa for details. 327 warnings.warn("Assuming residue %s is an unknown modified " 328 "amino acid" % residue.get_resname()) 329 return True 330 else: 331 # not a standard AA so skip 332 return False
333
334 - def build_peptides(self, entity, aa_only=1):
335 """Build and return a list of Polypeptide objects. 336 337 @param entity: polypeptides are searched for in this object 338 @type entity: L{Structure}, L{Model} or L{Chain} 339 340 @param aa_only: if 1, the residue needs to be a standard AA 341 @type aa_only: int 342 """ 343 is_connected = self._is_connected 344 accept = self._accept 345 level = entity.get_level() 346 # Decide which entity we are dealing with 347 if level == "S": 348 model = entity[0] 349 chain_list = model.get_list() 350 elif level == "M": 351 chain_list = entity.get_list() 352 elif level == "C": 353 chain_list = [entity] 354 else: 355 raise PDBException("Entity should be Structure, Model or Chain.") 356 pp_list = [] 357 for chain in chain_list: 358 chain_it = iter(chain) 359 try: 360 prev_res = next(chain_it) 361 while not accept(prev_res, aa_only): 362 prev_res = next(chain_it) 363 except StopIteration: 364 # No interesting residues at all in this chain 365 continue 366 pp = None 367 for next_res in chain_it: 368 if accept(prev_res, aa_only) \ 369 and accept(next_res, aa_only) \ 370 and is_connected(prev_res, next_res): 371 if pp is None: 372 pp = Polypeptide() 373 pp.append(prev_res) 374 pp_list.append(pp) 375 pp.append(next_res) 376 else: 377 # Either too far apart, or one of the residues is unwanted. 378 # End the current peptide 379 pp = None 380 prev_res = next_res 381 return pp_list
382 383
384 -class CaPPBuilder(_PPBuilder):
385 """Use CA--CA distance to find polypeptides.""" 386
387 - def __init__(self, radius=4.3):
388 _PPBuilder.__init__(self, radius)
389
390 - def _is_connected(self, prev_res, next_res):
391 for r in [prev_res, next_res]: 392 if not r.has_id("CA"): 393 return False 394 n = next_res["CA"] 395 p = prev_res["CA"] 396 # Unpack disordered 397 if n.is_disordered(): 398 nlist = n.disordered_get_list() 399 else: 400 nlist = [n] 401 if p.is_disordered(): 402 plist = p.disordered_get_list() 403 else: 404 plist = [p] 405 for nn in nlist: 406 for pp in plist: 407 if (nn - pp) < self.radius: 408 return True 409 return False
410 411
412 -class PPBuilder(_PPBuilder):
413 """Use C--N distance to find polypeptides.""" 414
415 - def __init__(self, radius=1.8):
416 _PPBuilder.__init__(self, radius)
417
418 - def _is_connected(self, prev_res, next_res):
419 if not prev_res.has_id("C"): 420 return False 421 if not next_res.has_id("N"): 422 return False 423 test_dist = self._test_dist 424 c = prev_res["C"] 425 n = next_res["N"] 426 # Test all disordered atom positions! 427 if c.is_disordered(): 428 clist = c.disordered_get_list() 429 else: 430 clist = [c] 431 if n.is_disordered(): 432 nlist = n.disordered_get_list() 433 else: 434 nlist = [n] 435 for nn in nlist: 436 for cc in clist: 437 # To form a peptide bond, N and C must be 438 # within radius and have the same altloc 439 # identifier or one altloc blank 440 n_altloc = nn.get_altloc() 441 c_altloc = cc.get_altloc() 442 if n_altloc == c_altloc or n_altloc == " " or c_altloc == " ": 443 if test_dist(nn, cc): 444 # Select the disordered atoms that 445 # are indeed bonded 446 if c.is_disordered(): 447 c.disordered_select(c_altloc) 448 if n.is_disordered(): 449 n.disordered_select(n_altloc) 450 return True 451 return False
452
453 - def _test_dist(self, c, n):
454 """Return 1 if distance between atoms<radius (PRIVATE).""" 455 if (c - n) < self.radius: 456 return 1 457 else: 458 return 0
459