Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to 
 12  the ATOM records (representing the atoms experimentally observed). 
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies. 
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (up to 4 digits) and an optional PDB insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  protein_letters_3to1 -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from __future__ import print_function 
 30  from Bio._py3k import basestring 
 31   
 32  from copy import copy 
 33   
 34  from Bio.Data.SCOPData import protein_letters_3to1 
 35   
 36  from Bio.SCOP.Residues import Residues 
 37   
 38   
39 -def normalize_letters(one_letter_code):
40 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 41 42 Letters are uppercased, and "." ("Unknown") is converted to "X". 43 """ 44 if one_letter_code == '.': 45 return 'X' 46 else: 47 return one_letter_code.upper()
48 49
50 -class SeqMapIndex(dict):
51 """An RAF file index. 52 53 The RAF file itself is about 50 MB. This index provides rapid, random 54 access of RAF records without having to load the entire file into memory. 55 56 The index key is a concatenation of the PDB ID and chain ID. e.g 57 "2drcA", "155c_". RAF uses an underscore to indicate blank 58 chain IDs. 59 """ 60
61 - def __init__(self, filename):
62 """ 63 Arguments: 64 65 filename -- The file to index 66 """ 67 dict.__init__(self) 68 self.filename = filename 69 70 with open(self.filename, "rU") as f: 71 position = 0 72 while True: 73 line = f.readline() 74 if not line: 75 break 76 key = line[0:5] 77 if key is not None: 78 self[key]=position 79 position = f.tell()
80
81 - def __getitem__(self, key):
82 """ Return an item from the indexed file. """ 83 position = dict.__getitem__(self, key) 84 85 with open(self.filename, "rU") as f: 86 f.seek(position) 87 line = f.readline() 88 record = SeqMap(line) 89 return record
90
91 - def getSeqMap(self, residues):
92 """Get the sequence map for a collection of residues. 93 94 residues -- A Residues instance, or a string that can be converted into 95 a Residues instance. 96 """ 97 if isinstance(residues, basestring): 98 residues = Residues(residues) 99 100 pdbid = residues.pdbid 101 frags = residues.fragments 102 if not frags: 103 frags =(('_', '', ''),) # All residues of unnamed chain 104 105 seqMap = None 106 for frag in frags: 107 chainid = frag[0] 108 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': 109 chainid = '_' 110 id = pdbid + chainid 111 112 sm = self[id] 113 114 #Cut out fragment of interest 115 start = 0 116 end = len(sm.res) 117 if frag[1]: 118 start = int(sm.index(frag[1], chainid)) 119 if frag[2]: 120 end = int(sm.index(frag[2], chainid)+1) 121 122 sm = sm[start:end] 123 124 if seqMap is None: 125 seqMap = sm 126 else: 127 seqMap += sm 128 129 return seqMap
130 131
132 -class SeqMap(object):
133 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 134 135 This is a list like object; You can find the location of particular residues 136 with index(), slice this SeqMap into fragments, and glue fragments back 137 together with extend(). 138 139 pdbid -- The PDB 4 character ID 140 141 pdb_datestamp -- From the PDB file 142 143 version -- The RAF format version. e.g. 0.01 144 145 flags -- RAF flags. (See release notes for more information.) 146 147 res -- A list of Res objects, one for each residue in this sequence map 148 """ 149
150 - def __init__(self, line=None):
151 self.pdbid = '' 152 self.pdb_datestamp = '' 153 self.version = '' 154 self.flags = '' 155 self.res = [] 156 if line: 157 self._process(line)
158
159 - def _process(self, line):
160 """Parses a RAF record into a SeqMap object. 161 """ 162 header_len = 38 163 164 line = line.rstrip() # no trailing whitespace 165 166 if len(line)<header_len: 167 raise ValueError("Incomplete header: "+line) 168 169 self.pdbid = line[0:4] 170 chainid = line[4:5] 171 172 self.version = line[6:10] 173 174 #Raf format versions 0.01 and 0.02 are identical for practical purposes 175 if(self.version != "0.01" and self.version !="0.02"): 176 raise ValueError("Incompatible RAF version: "+self.version) 177 178 self.pdb_datestamp = line[14:20] 179 self.flags = line[21:27] 180 181 for i in range(header_len, len(line), 7): 182 f = line[i : i+7] 183 if len(f)!=7: 184 raise ValueError("Corrupt Field: ("+f+")") 185 r = Res() 186 r.chainid = chainid 187 r.resid = f[0:5].strip() 188 r.atom = normalize_letters(f[5:6]) 189 r.seqres = normalize_letters(f[6:7]) 190 191 self.res.append(r)
192
193 - def index(self, resid, chainid="_"):
194 for i in range(0, len(self.res)): 195 if self.res[i].resid == resid and self.res[i].chainid == chainid: 196 return i 197 raise KeyError("No such residue "+chainid+resid)
198
199 - def __getitem__(self, index):
200 if not isinstance(index, slice): 201 raise NotImplementedError 202 s = copy(self) 203 s.res = s.res[index] 204 return s
205
206 - def append(self, res):
207 """Append another Res object onto the list of residue mappings.""" 208 self.res.append(res)
209
210 - def extend(self, other):
211 """Append another SeqMap onto the end of self. 212 213 Both SeqMaps must have the same PDB ID, PDB datestamp and 214 RAF version. The RAF flags are erased if they are inconsistent. This 215 may happen when fragments are taken from different chains. 216 """ 217 if not isinstance(other, SeqMap): 218 raise TypeError("Can only extend a SeqMap with a SeqMap.") 219 if self.pdbid != other.pdbid: 220 raise TypeError("Cannot add fragments from different proteins") 221 if self.version != other.version: 222 raise TypeError("Incompatible rafs") 223 if self.pdb_datestamp != other.pdb_datestamp: 224 raise TypeError("Different pdb dates!") 225 if self.flags != other.flags: 226 self.flags = '' 227 self.res += other.res
228
229 - def __iadd__(self, other):
230 self.extend(other) 231 return self
232
233 - def __add__(self, other):
234 s = copy(self) 235 s.extend(other) 236 return s
237
238 - def getAtoms(self, pdb_handle, out_handle):
239 """Extract all relevant ATOM and HETATOM records from a PDB file. 240 241 The PDB file is scanned for ATOM and HETATOM records. If the 242 chain ID, residue ID (seqNum and iCode), and residue type match 243 a residue in this sequence map, then the record is echoed to the 244 output handle. 245 246 This is typically used to find the coordinates of a domain, or other 247 residue subset. 248 249 pdb_handle -- A handle to the relevant PDB file. 250 251 out_handle -- All output is written to this file like object. 252 """ 253 #This code should be refactored when (if?) biopython gets a PDB parser 254 255 #The set of residues that I have to find records for. 256 resSet = {} 257 for r in self.res: 258 if r.atom=='X': # Unknown residue type 259 continue 260 chainid = r.chainid 261 if chainid == '_': 262 chainid = ' ' 263 resid = r.resid 264 resSet[(chainid, resid)] = r 265 266 resFound = {} 267 for line in pdb_handle: 268 if line.startswith("ATOM ") or line.startswith("HETATM"): 269 chainid = line[21:22] 270 resid = line[22:27].strip() 271 key = (chainid, resid) 272 if key in resSet: 273 res = resSet[key] 274 atom_aa = res.atom 275 resName = line[17:20] 276 if resName in protein_letters_3to1: 277 if protein_letters_3to1[resName] == atom_aa: 278 out_handle.write(line) 279 resFound[key] = res 280 281 if len(resSet) != len(resFound): 282 #for k in resFound: 283 # del resSet[k] 284 #print(resSet) 285 286 raise RuntimeError('I could not find at least one ATOM or HETATM' 287 +' record for each and every residue in this sequence map.')
288 289
290 -class Res(object):
291 """ A single residue mapping from a RAF record. 292 293 chainid -- A single character chain ID. 294 295 resid -- The residue ID. 296 297 atom -- amino acid one-letter code from ATOM records. 298 299 seqres -- amino acid one-letter code from SEQRES records. 300 """
301 - def __init__(self):
302 self.chainid = '' 303 self.resid = '' 304 self.atom = '' 305 self.seqres = ''
306 307
308 -def parse(handle):
309 """Iterates over a RAF file, returning a SeqMap object for each line 310 in the file. 311 312 Arguments: 313 314 handle -- file-like object. 315 """ 316 for line in handle: 317 yield SeqMap(line)
318