Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to 
 12  the ATOM records (representing the atoms experimentally observed). 
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies. 
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (up to 4 digits) and an optional PDB insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  protein_letters_3to1 -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from __future__ import print_function 
 30  from Bio._py3k import basestring 
 31  from Bio._py3k import _universal_read_mode 
 32   
 33  from copy import copy 
 34   
 35  from Bio.Data.SCOPData import protein_letters_3to1 
 36   
 37  from Bio.SCOP.Residues import Residues 
 38   
 39  __docformat__ = "restructuredtext en" 
 40   
 41   
42 -def normalize_letters(one_letter_code):
43 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 44 45 Letters are uppercased, and "." ("Unknown") is converted to "X". 46 """ 47 if one_letter_code == '.': 48 return 'X' 49 else: 50 return one_letter_code.upper()
51 52
53 -class SeqMapIndex(dict):
54 """An RAF file index. 55 56 The RAF file itself is about 50 MB. This index provides rapid, random 57 access of RAF records without having to load the entire file into memory. 58 59 The index key is a concatenation of the PDB ID and chain ID. e.g 60 "2drcA", ``"155c_"``. RAF uses an underscore to indicate blank 61 chain IDs. 62 """ 63
64 - def __init__(self, filename):
65 """ 66 Arguments: 67 68 filename -- The file to index 69 """ 70 dict.__init__(self) 71 self.filename = filename 72 73 with open(self.filename, _universal_read_mode) as f: 74 position = 0 75 while True: 76 line = f.readline() 77 if not line: 78 break 79 key = line[0:5] 80 if key is not None: 81 self[key] = position 82 position = f.tell()
83
84 - def __getitem__(self, key):
85 """ Return an item from the indexed file. """ 86 position = dict.__getitem__(self, key) 87 88 with open(self.filename, _universal_read_mode) as f: 89 f.seek(position) 90 line = f.readline() 91 record = SeqMap(line) 92 return record
93
94 - def getSeqMap(self, residues):
95 """Get the sequence map for a collection of residues. 96 97 residues -- A Residues instance, or a string that can be converted into 98 a Residues instance. 99 """ 100 if isinstance(residues, basestring): 101 residues = Residues(residues) 102 103 pdbid = residues.pdbid 104 frags = residues.fragments 105 if not frags: 106 frags = (('_', '', ''),) # All residues of unnamed chain 107 108 seqMap = None 109 for frag in frags: 110 chainid = frag[0] 111 if chainid in ['', '-', ' ', '_']: 112 chainid = '_' 113 id = pdbid + chainid 114 115 sm = self[id] 116 117 # Cut out fragment of interest 118 start = 0 119 end = len(sm.res) 120 if frag[1]: 121 start = int(sm.index(frag[1], chainid)) 122 if frag[2]: 123 end = int(sm.index(frag[2], chainid)) + 1 124 125 sm = sm[start:end] 126 127 if seqMap is None: 128 seqMap = sm 129 else: 130 seqMap += sm 131 132 return seqMap
133 134
135 -class SeqMap(object):
136 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 137 138 This is a list like object; You can find the location of particular residues 139 with index(), slice this SeqMap into fragments, and glue fragments back 140 together with extend(). 141 142 pdbid -- The PDB 4 character ID 143 144 pdb_datestamp -- From the PDB file 145 146 version -- The RAF format version. e.g. 0.01 147 148 flags -- RAF flags. (See release notes for more information.) 149 150 res -- A list of Res objects, one for each residue in this sequence map 151 """ 152
153 - def __init__(self, line=None):
154 self.pdbid = '' 155 self.pdb_datestamp = '' 156 self.version = '' 157 self.flags = '' 158 self.res = [] 159 if line: 160 self._process(line)
161
162 - def _process(self, line):
163 """Parses a RAF record into a SeqMap object. 164 """ 165 header_len = 38 166 167 line = line.rstrip() # no trailing whitespace 168 169 if len(line) < header_len: 170 raise ValueError("Incomplete header: " + line) 171 172 self.pdbid = line[0:4] 173 chainid = line[4:5] 174 175 self.version = line[6:10] 176 177 # Raf format versions 0.01 and 0.02 are identical for practical purposes 178 if(self.version != "0.01" and self.version != "0.02"): 179 raise ValueError("Incompatible RAF version: " + self.version) 180 181 self.pdb_datestamp = line[14:20] 182 self.flags = line[21:27] 183 184 for i in range(header_len, len(line), 7): 185 f = line[i:i + 7] 186 if len(f) != 7: 187 raise ValueError("Corrupt Field: (" + f + ")") 188 r = Res() 189 r.chainid = chainid 190 r.resid = f[0:5].strip() 191 r.atom = normalize_letters(f[5:6]) 192 r.seqres = normalize_letters(f[6:7]) 193 194 self.res.append(r)
195
196 - def index(self, resid, chainid="_"):
197 for i in range(0, len(self.res)): 198 if self.res[i].resid == resid and self.res[i].chainid == chainid: 199 return i 200 raise KeyError("No such residue " + chainid + resid)
201
202 - def __getitem__(self, index):
203 if not isinstance(index, slice): 204 raise NotImplementedError 205 s = copy(self) 206 s.res = s.res[index] 207 return s
208
209 - def append(self, res):
210 """Append another Res object onto the list of residue mappings.""" 211 self.res.append(res)
212
213 - def extend(self, other):
214 """Append another SeqMap onto the end of self. 215 216 Both SeqMaps must have the same PDB ID, PDB datestamp and 217 RAF version. The RAF flags are erased if they are inconsistent. This 218 may happen when fragments are taken from different chains. 219 """ 220 if not isinstance(other, SeqMap): 221 raise TypeError("Can only extend a SeqMap with a SeqMap.") 222 if self.pdbid != other.pdbid: 223 raise TypeError("Cannot add fragments from different proteins") 224 if self.version != other.version: 225 raise TypeError("Incompatible rafs") 226 if self.pdb_datestamp != other.pdb_datestamp: 227 raise TypeError("Different pdb dates!") 228 if self.flags != other.flags: 229 self.flags = '' 230 self.res += other.res
231
232 - def __iadd__(self, other):
233 self.extend(other) 234 return self
235
236 - def __add__(self, other):
237 s = copy(self) 238 s.extend(other) 239 return s
240
241 - def getAtoms(self, pdb_handle, out_handle):
242 """Extract all relevant ATOM and HETATOM records from a PDB file. 243 244 The PDB file is scanned for ATOM and HETATOM records. If the 245 chain ID, residue ID (seqNum and iCode), and residue type match 246 a residue in this sequence map, then the record is echoed to the 247 output handle. 248 249 This is typically used to find the coordinates of a domain, or other 250 residue subset. 251 252 pdb_handle -- A handle to the relevant PDB file. 253 254 out_handle -- All output is written to this file like object. 255 """ 256 # This code should be refactored when (if?) biopython gets a PDB parser 257 258 # The set of residues that I have to find records for. 259 resSet = {} 260 for r in self.res: 261 if r.atom == 'X': # Unknown residue type 262 continue 263 chainid = r.chainid 264 if chainid == '_': 265 chainid = ' ' 266 resid = r.resid 267 resSet[(chainid, resid)] = r 268 269 resFound = {} 270 for line in pdb_handle: 271 if line.startswith("ATOM ") or line.startswith("HETATM"): 272 chainid = line[21:22] 273 resid = line[22:27].strip() 274 key = (chainid, resid) 275 if key in resSet: 276 res = resSet[key] 277 atom_aa = res.atom 278 resName = line[17:20] 279 if resName in protein_letters_3to1: 280 if protein_letters_3to1[resName] == atom_aa: 281 out_handle.write(line) 282 resFound[key] = res 283 284 if len(resSet) != len(resFound): 285 # for k in resFound: 286 # del resSet[k] 287 # print(resSet) 288 289 raise RuntimeError('I could not find at least one ATOM or HETATM' 290 + ' record for each and every residue in this sequence map.')
291 292
293 -class Res(object):
294 """ A single residue mapping from a RAF record. 295 296 chainid -- A single character chain ID. 297 298 resid -- The residue ID. 299 300 atom -- amino acid one-letter code from ATOM records. 301 302 seqres -- amino acid one-letter code from SEQRES records. 303 """
304 - def __init__(self):
305 self.chainid = '' 306 self.resid = '' 307 self.atom = '' 308 self.seqres = ''
309 310
311 -def parse(handle):
312 """Iterates over a RAF file, returning a SeqMap object for each line 313 in the file. 314 315 Arguments: 316 317 handle -- file-like object. 318 """ 319 for line in handle: 320 yield SeqMap(line)
321