1
2
3
4
5
6
7
8 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
9
10 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
11 records (representing the sequence of the molecule used in an experiment) to
12 the ATOM records (representing the atoms experimentally observed).
13
14 This data is derived from the Protein Data Bank CIF files. Known errors in the
15 CIF files are corrected manually, with the original PDB file serving as the
16 final arbiter in case of discrepancies.
17
18 Residues are referenced by residue ID. This consists of a the PDB residue
19 sequence number (up to 4 digits) and an optional PDB insertion code (an
20 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
21
22 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
23
24 to_one_letter_code -- A mapping from the 3-letter amino acid codes found
25 in PDB files to 1-letter codes. The 3-letter codes
26 include chemically modified residues.
27 """
28
29 from copy import copy
30
31 from Bio.SCOP.Residues import Residues
32
33 from three_to_one_dict import to_one_letter_code
34
35
37 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
38
39 Letters are uppercased, and "." ("Unknown") is converted to "X".
40 """
41 if one_letter_code == '.':
42 return 'X'
43 else:
44 return one_letter_code.upper()
45
46
48 """An RAF file index.
49
50 The RAF file itself is about 50 MB. This index provides rapid, random
51 access of RAF records without having to load the entire file into memory.
52
53 The index key is a concatenation of the PDB ID and chain ID. e.g
54 "2drcA", "155c_". RAF uses an underscore to indicate blank
55 chain IDs.
56 """
57
59 """
60 Arguments:
61
62 filename -- The file to index
63 """
64 dict.__init__(self)
65 self.filename = filename
66
67 f = open(self.filename, "rU")
68 try:
69 position = 0
70 while True:
71 line = f.readline()
72 if not line:
73 break
74 key = line[0:5]
75 if key is not None:
76 self[key]=position
77 position = f.tell()
78 finally:
79 f.close()
80
93
95 """Get the sequence map for a collection of residues.
96
97 residues -- A Residues instance, or a string that can be converted into
98 a Residues instance.
99 """
100 if isinstance(residues, basestring):
101 residues = Residues(residues)
102
103 pdbid = residues.pdbid
104 frags = residues.fragments
105 if not frags:
106 frags =(('_','',''),)
107
108 seqMap = None
109 for frag in frags:
110 chainid = frag[0]
111 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
112 chainid = '_'
113 id = pdbid + chainid
114
115 sm = self[id]
116
117
118 start = 0
119 end = len(sm.res)
120 if frag[1]:
121 start = int(sm.index(frag[1], chainid))
122 if frag[2]:
123 end = int(sm.index(frag[2], chainid)+1)
124
125 sm = sm[start:end]
126
127 if seqMap is None:
128 seqMap = sm
129 else:
130 seqMap += sm
131
132 return seqMap
133
134
136 """An ASTRAL RAF (Rapid Access Format) Sequence Map.
137
138 This is a list like object; You can find the location of particular residues
139 with index(), slice this SeqMap into fragments, and glue fragments back
140 together with extend().
141
142 pdbid -- The PDB 4 character ID
143
144 pdb_datestamp -- From the PDB file
145
146 version -- The RAF format version. e.g. 0.01
147
148 flags -- RAF flags. (See release notes for more information.)
149
150 res -- A list of Res objects, one for each residue in this sequence map
151 """
152
154 self.pdbid = ''
155 self.pdb_datestamp = ''
156 self.version = ''
157 self.flags = ''
158 self.res = []
159 if line:
160 self._process(line)
161
163 """Parses a RAF record into a SeqMap object.
164 """
165 header_len = 38
166
167 line = line.rstrip()
168
169 if len(line)<header_len:
170 raise ValueError("Incomplete header: "+line)
171
172 self.pdbid = line[0:4]
173 chainid = line[4:5]
174
175 self.version = line[6:10]
176
177
178 if(self.version != "0.01" and self.version !="0.02"):
179 raise ValueError("Incompatible RAF version: "+self.version)
180
181 self.pdb_datestamp = line[14:20]
182 self.flags = line[21:27]
183
184 for i in range(header_len, len(line), 7):
185 f = line[i : i+7]
186 if len(f)!=7:
187 raise ValueError("Corrupt Field: ("+f+")")
188 r = Res()
189 r.chainid = chainid
190 r.resid = f[0:5].strip()
191 r.atom = normalize_letters(f[5:6])
192 r.seqres = normalize_letters(f[6:7])
193
194 self.res.append(r)
195
196 - def index(self, resid, chainid="_"):
197 for i in range(0, len(self.res)):
198 if self.res[i].resid == resid and self.res[i].chainid == chainid:
199 return i
200 raise KeyError("No such residue "+chainid+resid)
201
203 if not isinstance(index, slice):
204 raise NotImplementedError
205 s = copy(self)
206 s.res = s.res[index]
207 return s
208
210 """Append another Res object onto the list of residue mappings."""
211 self.res.append(res)
212
214 """Append another SeqMap onto the end of self.
215
216 Both SeqMaps must have the same PDB ID, PDB datestamp and
217 RAF version. The RAF flags are erased if they are inconsistent. This
218 may happen when fragments are taken from different chains.
219 """
220 if not isinstance(other, SeqMap):
221 raise TypeError("Can only extend a SeqMap with a SeqMap.")
222 if self.pdbid != other.pdbid:
223 raise TypeError("Cannot add fragments from different proteins")
224 if self.version != other.version:
225 raise TypeError("Incompatible rafs")
226 if self.pdb_datestamp != other.pdb_datestamp:
227 raise TypeError("Different pdb dates!")
228 if self.flags != other.flags:
229 self.flags = ''
230 self.res += other.res
231
235
240
241 - def getAtoms(self, pdb_handle, out_handle):
242 """Extract all relevant ATOM and HETATOM records from a PDB file.
243
244 The PDB file is scanned for ATOM and HETATOM records. If the
245 chain ID, residue ID (seqNum and iCode), and residue type match
246 a residue in this sequence map, then the record is echoed to the
247 output handle.
248
249 This is typically used to find the coordinates of a domain, or other
250 residue subset.
251
252 pdb_handle -- A handle to the relevant PDB file.
253
254 out_handle -- All output is written to this file like object.
255 """
256
257
258
259 resSet = {}
260 for r in self.res:
261 if r.atom=='X':
262 continue
263 chainid = r.chainid
264 if chainid == '_':
265 chainid = ' '
266 resid = r.resid
267 resSet[(chainid,resid)] = r
268
269 resFound = {}
270 for line in pdb_handle.xreadlines():
271 if line.startswith("ATOM ") or line.startswith("HETATM"):
272 chainid = line[21:22]
273 resid = line[22:27].strip()
274 key = (chainid, resid)
275 if key in resSet:
276 res = resSet[key]
277 atom_aa = res.atom
278 resName = line[17:20]
279 if resName in to_one_letter_code:
280 if to_one_letter_code[resName] == atom_aa:
281 out_handle.write(line)
282 resFound[key] = res
283
284 if len(resSet) != len(resFound):
285
286
287
288
289 raise RuntimeError('I could not find at least one ATOM or HETATM'
290 +' record for each and every residue in this sequence map.')
291
292
294 """ A single residue mapping from a RAF record.
295
296 chainid -- A single character chain ID.
297
298 resid -- The residue ID.
299
300 atom -- amino acid one-letter code from ATOM records.
301
302 seqres -- amino acid one-letter code from SEQRES records.
303 """
305 self.chainid = ''
306 self.resid = ''
307 self.atom = ''
308 self.seqres = ''
309
310
312 """Iterates over a RAF file, returning a SeqMap object for each line
313 in the file.
314
315 Arguments:
316
317 handle -- file-like object.
318 """
319 for line in handle:
320 yield SeqMap(line)
321