Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to 
 12  the ATOM records (representing the atoms experimentally observed). 
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies. 
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (up to 4 digits) and an optional PDB insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  protein_letters_3to1 -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from __future__ import print_function 
 30  from Bio._py3k import basestring 
 31  from Bio._py3k import _universal_read_mode 
 32   
 33  from copy import copy 
 34   
 35  from Bio.Data.SCOPData import protein_letters_3to1 
 36   
 37  from Bio.SCOP.Residues import Residues 
 38   
 39   
40 -def normalize_letters(one_letter_code):
41 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 42 43 Letters are uppercased, and "." ("Unknown") is converted to "X". 44 """ 45 if one_letter_code == '.': 46 return 'X' 47 else: 48 return one_letter_code.upper()
49 50
51 -class SeqMapIndex(dict):
52 """An RAF file index. 53 54 The RAF file itself is about 50 MB. This index provides rapid, random 55 access of RAF records without having to load the entire file into memory. 56 57 The index key is a concatenation of the PDB ID and chain ID. e.g 58 "2drcA", "155c_". RAF uses an underscore to indicate blank 59 chain IDs. 60 """ 61
62 - def __init__(self, filename):
63 """ 64 Arguments: 65 66 filename -- The file to index 67 """ 68 dict.__init__(self) 69 self.filename = filename 70 71 with open(self.filename, _universal_read_mode) as f: 72 position = 0 73 while True: 74 line = f.readline() 75 if not line: 76 break 77 key = line[0:5] 78 if key is not None: 79 self[key]=position 80 position = f.tell()
81
82 - def __getitem__(self, key):
83 """ Return an item from the indexed file. """ 84 position = dict.__getitem__(self, key) 85 86 with open(self.filename, _universal_read_mode) as f: 87 f.seek(position) 88 line = f.readline() 89 record = SeqMap(line) 90 return record
91
92 - def getSeqMap(self, residues):
93 """Get the sequence map for a collection of residues. 94 95 residues -- A Residues instance, or a string that can be converted into 96 a Residues instance. 97 """ 98 if isinstance(residues, basestring): 99 residues = Residues(residues) 100 101 pdbid = residues.pdbid 102 frags = residues.fragments 103 if not frags: 104 frags = (('_', '', ''),) # All residues of unnamed chain 105 106 seqMap = None 107 for frag in frags: 108 chainid = frag[0] 109 if chainid in ['', '-', ' ', '_']: 110 chainid = '_' 111 id = pdbid + chainid 112 113 sm = self[id] 114 115 #Cut out fragment of interest 116 start = 0 117 end = len(sm.res) 118 if frag[1]: 119 start = int(sm.index(frag[1], chainid)) 120 if frag[2]: 121 end = int(sm.index(frag[2], chainid)) + 1 122 123 sm = sm[start:end] 124 125 if seqMap is None: 126 seqMap = sm 127 else: 128 seqMap += sm 129 130 return seqMap
131 132
133 -class SeqMap(object):
134 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 135 136 This is a list like object; You can find the location of particular residues 137 with index(), slice this SeqMap into fragments, and glue fragments back 138 together with extend(). 139 140 pdbid -- The PDB 4 character ID 141 142 pdb_datestamp -- From the PDB file 143 144 version -- The RAF format version. e.g. 0.01 145 146 flags -- RAF flags. (See release notes for more information.) 147 148 res -- A list of Res objects, one for each residue in this sequence map 149 """ 150
151 - def __init__(self, line=None):
152 self.pdbid = '' 153 self.pdb_datestamp = '' 154 self.version = '' 155 self.flags = '' 156 self.res = [] 157 if line: 158 self._process(line)
159
160 - def _process(self, line):
161 """Parses a RAF record into a SeqMap object. 162 """ 163 header_len = 38 164 165 line = line.rstrip() # no trailing whitespace 166 167 if len(line) < header_len: 168 raise ValueError("Incomplete header: "+line) 169 170 self.pdbid = line[0:4] 171 chainid = line[4:5] 172 173 self.version = line[6:10] 174 175 #Raf format versions 0.01 and 0.02 are identical for practical purposes 176 if(self.version != "0.01" and self.version != "0.02"): 177 raise ValueError("Incompatible RAF version: " + self.version) 178 179 self.pdb_datestamp = line[14:20] 180 self.flags = line[21:27] 181 182 for i in range(header_len, len(line), 7): 183 f = line[i : i+7] 184 if len(f)!=7: 185 raise ValueError("Corrupt Field: ("+f+")") 186 r = Res() 187 r.chainid = chainid 188 r.resid = f[0:5].strip() 189 r.atom = normalize_letters(f[5:6]) 190 r.seqres = normalize_letters(f[6:7]) 191 192 self.res.append(r)
193
194 - def index(self, resid, chainid="_"):
195 for i in range(0, len(self.res)): 196 if self.res[i].resid == resid and self.res[i].chainid == chainid: 197 return i 198 raise KeyError("No such residue " + chainid + resid)
199
200 - def __getitem__(self, index):
201 if not isinstance(index, slice): 202 raise NotImplementedError 203 s = copy(self) 204 s.res = s.res[index] 205 return s
206
207 - def append(self, res):
208 """Append another Res object onto the list of residue mappings.""" 209 self.res.append(res)
210
211 - def extend(self, other):
212 """Append another SeqMap onto the end of self. 213 214 Both SeqMaps must have the same PDB ID, PDB datestamp and 215 RAF version. The RAF flags are erased if they are inconsistent. This 216 may happen when fragments are taken from different chains. 217 """ 218 if not isinstance(other, SeqMap): 219 raise TypeError("Can only extend a SeqMap with a SeqMap.") 220 if self.pdbid != other.pdbid: 221 raise TypeError("Cannot add fragments from different proteins") 222 if self.version != other.version: 223 raise TypeError("Incompatible rafs") 224 if self.pdb_datestamp != other.pdb_datestamp: 225 raise TypeError("Different pdb dates!") 226 if self.flags != other.flags: 227 self.flags = '' 228 self.res += other.res
229
230 - def __iadd__(self, other):
231 self.extend(other) 232 return self
233
234 - def __add__(self, other):
235 s = copy(self) 236 s.extend(other) 237 return s
238
239 - def getAtoms(self, pdb_handle, out_handle):
240 """Extract all relevant ATOM and HETATOM records from a PDB file. 241 242 The PDB file is scanned for ATOM and HETATOM records. If the 243 chain ID, residue ID (seqNum and iCode), and residue type match 244 a residue in this sequence map, then the record is echoed to the 245 output handle. 246 247 This is typically used to find the coordinates of a domain, or other 248 residue subset. 249 250 pdb_handle -- A handle to the relevant PDB file. 251 252 out_handle -- All output is written to this file like object. 253 """ 254 #This code should be refactored when (if?) biopython gets a PDB parser 255 256 #The set of residues that I have to find records for. 257 resSet = {} 258 for r in self.res: 259 if r.atom == 'X': # Unknown residue type 260 continue 261 chainid = r.chainid 262 if chainid == '_': 263 chainid = ' ' 264 resid = r.resid 265 resSet[(chainid, resid)] = r 266 267 resFound = {} 268 for line in pdb_handle: 269 if line.startswith("ATOM ") or line.startswith("HETATM"): 270 chainid = line[21:22] 271 resid = line[22:27].strip() 272 key = (chainid, resid) 273 if key in resSet: 274 res = resSet[key] 275 atom_aa = res.atom 276 resName = line[17:20] 277 if resName in protein_letters_3to1: 278 if protein_letters_3to1[resName] == atom_aa: 279 out_handle.write(line) 280 resFound[key] = res 281 282 if len(resSet) != len(resFound): 283 #for k in resFound: 284 # del resSet[k] 285 #print(resSet) 286 287 raise RuntimeError('I could not find at least one ATOM or HETATM' 288 +' record for each and every residue in this sequence map.')
289 290
291 -class Res(object):
292 """ A single residue mapping from a RAF record. 293 294 chainid -- A single character chain ID. 295 296 resid -- The residue ID. 297 298 atom -- amino acid one-letter code from ATOM records. 299 300 seqres -- amino acid one-letter code from SEQRES records. 301 """
302 - def __init__(self):
303 self.chainid = '' 304 self.resid = '' 305 self.atom = '' 306 self.seqres = ''
307 308
309 -def parse(handle):
310 """Iterates over a RAF file, returning a SeqMap object for each line 311 in the file. 312 313 Arguments: 314 315 handle -- file-like object. 316 """ 317 for line in handle: 318 yield SeqMap(line)
319