1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """Parse the header of a PDB file."""
25
26
27 from __future__ import with_statement
28 import re
29
30 from Bio import File
31
32
41
42
44
45
46 references=[]
47 actref=""
48 for l in inl:
49 if re.search("\AREMARK 1",l):
50 if re.search("\AREMARK 1 REFERENCE",l):
51 if actref!="":
52 actref=re.sub("\s\s+"," ",actref)
53 if actref!=" ":
54 references.append(actref)
55 actref=""
56 else:
57 actref+=l[19:72].lower()
58
59 if actref!="":
60 actref=re.sub("\s\s+"," ",actref)
61 if actref!=" ":
62 references.append(actref)
63 return references
64
65
66
83
84
86 """Chops lines ending with ' 1CSA 14' and the like."""
87 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z","",line)
88
89
91 """Chops lines ending with ' 14-JUL-97 1CSA' and the like."""
92 return re.sub("\s\s\s\s+.*\Z","",line)
93
94
96 """Makes A Lowercase String With Capitals."""
97 l=line.lower()
98 s=""
99 i=0
100 nextCap=1
101 while i<len(l):
102 c=l[i]
103 if c>='a' and c<='z' and nextCap:
104 c=c.upper()
105 nextCap=0
106 elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\
107 c=='-' or c=='_':
108 nextCap=1
109 s+=c
110 i+=1
111 return s
112
113
def parse_pdb_header(infile):
    """Return the header lines of a PDB file as a dictionary.

    ``infile`` is handed to ``File.as_handle`` (opened in ``'r'`` mode).
    Lines are collected until the first ATOM, HETATM or MODEL record and
    then passed to ``_parse_pdb_header_list``.

    Dictionary keys are: head, deposition_date, release_date, structure_method,
    resolution, structure_reference, journal_reference, author and
    compound.
    """
    coordinate_records = ('ATOM', 'HETATM', 'MODEL')
    header = []
    with File.as_handle(infile, 'r') as f:
        for l in f:
            # The record name sits in the first six columns, padded with
            # blanks; strip the padding before comparing so that the
            # four- and five-letter names are recognised as well.
            if l[0:6].rstrip() in coordinate_records:
                break
            header.append(l)
    return _parse_pdb_header_list(header)
132
133
135
def _store_molecule_token(section, mol_id, last_key, text):
    """Record one COMPND/SOURCE line in *section*; return ``(mol_id, last_key)``.

    ``text`` is either ``key: value`` or continuation text.  A ``mol_id``
    key opens a new sub-dictionary and makes it current; any other key is
    stored under the current molecule; text without a colon is appended
    (plus a trailing space) to the value stored last.
    """
    tok = text.split(":")
    if len(tok) < 2:
        section[mol_id][last_key] += tok[0] + " "
        return mol_id, last_key
    ckey = tok[0]
    cval = re.sub(r"\A\s*", "", tok[1])
    if ckey == 'mol_id':
        section[cval] = {'misc': ''}
        return cval, "misc"
    section[mol_id][ckey] = cval
    return mol_id, ckey


def _parse_pdb_header_list(header):
    """Build the header dictionary from a list of PDB header lines.

    ``header`` is a list of raw lines (line endings allowed).  The record
    name is taken from the first six columns and the payload from column 11
    onwards.  TITLE, HEADER, COMPND, SOURCE, KEYWDS, EXPDTA, REVDAT, JRNL,
    AUTHOR and the REMARK 2 resolution line are interpreted; all other
    records are ignored.

    Returns a dictionary that always has the keys name, head,
    deposition_date, release_date, structure_method, resolution,
    structure_reference, journal_reference, author, compound and source;
    keywords and journal are added when the matching records occur.
    ``resolution`` is ``None`` when the REMARK 2 value is not a number.
    """
    pdbh = {'name': "",
            'head': '',
            'deposition_date': "1909-01-08",
            'release_date': "1909-01-08",
            'structure_method': "unknown",
            'resolution': 0.0,
            'structure_reference': "unknown",
            'journal_reference': "unknown",
            'author': "",
            'compound': {'1': {'misc': ''}},
            'source': {'1': {'misc': ''}}}

    pdbh['structure_reference'] = _get_references(header)
    pdbh['journal_reference'] = _get_journal(header)
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"
    date_pat = re.compile(r"\d\d-\w\w\w-\d\d")
    # \s+ tolerates the column padding around the remark number.
    resolution_pat = re.compile(r"REMARK\s+2\s+RESOLUTION.")

    for hh in header:
        h = hh.rstrip()  # chop line breaks and trailing blanks
        key = h[:6].strip()
        tail = h[10:].strip()

        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            # 'name' is always present, so test its content: only put a
            # separator in front of continuation lines, not the first one.
            if pdbh['name']:
                pdbh['name'] += " " + name
            else:
                pdbh['name'] = name
        elif key == "HEADER":
            rr = date_pat.search(tail)
            if rr is not None:
                pdbh['deposition_date'] = _format_date(_nice_case(rr.group()))
            pdbh['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            # Pull out an E.C. number (four dot-separated integers) if present.
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                pdbh['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _store_molecule_token(
                pdbh['compound'], comp_molid, last_comp_key, tt)
        elif key == "SOURCE":
            tt = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            # SOURCE keeps its own current molecule id, independent of the
            # one tracked for COMPND.
            src_molid, last_src_key = _store_molecule_token(
                pdbh['source'], src_molid, last_src_key, tt)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdbh:
                pdbh['keywords'] += " " + kwd
            else:
                pdbh['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # Drop anything that follows a gap of seven whitespace characters.
            expd = re.sub(r'\s{7}.*\Z', '', expd)
            pdbh['structure_method'] = expd.lower()
        elif key == "CAVEAT":
            pass
        elif key == "REVDAT":
            rr = date_pat.search(tail)
            if rr is not None:
                pdbh['release_date'] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            pdbh['journal'] = pdbh.get('journal', '') + tail
        elif key == "AUTHOR":
            pdbh['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if resolution_pat.search(hh):
                r = _chop_end_codes(resolution_pat.sub('', hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh['resolution'] = float(r)
                except ValueError:
                    # Non-numeric resolution text.
                    pdbh['resolution'] = None

    # With no EXPDTA record, a positive resolution is taken to mean X-ray.
    if pdbh['structure_method'] == 'unknown':
        resolution = pdbh['resolution']
        if resolution is not None and resolution > 0.0:
            pdbh['structure_method'] = 'x-ray diffraction'
    return pdbh
259
if __name__ == '__main__':
    # Command-line check: parse the header of the PDB file named by the
    # first argument and print every key/value pair of the result.
    import sys

    if len(sys.argv) < 2:
        sys.exit("usage: %s PDB_FILE" % sys.argv[0])
    filename = sys.argv[1]
    # The with-block closes the handle even if parsing raises.
    with open(filename, 'r') as handle:
        data_dict = parse_pdb_header(handle)

    # Single-argument print(...) is valid both as the Python 2 print
    # statement and as the Python 3 print function.
    for k, y in data_dict.items():
        print("-" * 40)
        print(k)
        print(y)
274