1
2
3
4
5
6 """Bio.SearchIO parser for HMMER domain table output format."""
7
8 from itertools import chain
9
10 from Bio.Alphabet import generic_protein
11 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
12
13 from hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer
14
15
17
18 """Base hmmer3-domtab iterator."""
19
21 """Returns a dictionary of parsed row values."""
22 assert self.line
23 cols = filter(None, self.line.strip().split(' '))
24
25
26 if len(cols) > 23:
27 cols[22] = ' '.join(cols[22:])
28 elif len(cols) < 23:
29 cols.append('')
30 assert len(cols) == 23
31
32
33 qresult = {}
34 qresult['id'] = cols[3]
35 qresult['accession'] = cols[4]
36 qresult['seq_len'] = int(cols[5])
37 hit = {}
38 hit['id'] = cols[0]
39 hit['accession'] = cols[1]
40 hit['seq_len'] = int(cols[2])
41 hit['evalue'] = float(cols[6])
42 hit['bitscore'] = float(cols[7])
43 hit['bias'] = float(cols[8])
44 hit['description'] = cols[22]
45 hsp = {}
46 hsp['domain_index'] = int(cols[9])
47
48 hsp['evalue_cond'] = float(cols[11])
49 hsp['evalue'] = float(cols[12])
50 hsp['bitscore'] = float(cols[13])
51 hsp['bias'] = float(cols[14])
52 hsp['env_start'] = int(cols[19]) - 1
53 hsp['env_end'] = int(cols[20])
54 hsp['acc_avg'] = float(cols[21])
55 frag = {}
56
57 frag['hit_strand'] = frag['query_strand'] = 0
58 frag['hit_start'] = int(cols[15]) - 1
59 frag['hit_end'] = int(cols[16])
60 frag['query_start'] = int(cols[17]) - 1
61 frag['query_end'] = int(cols[18])
62
63 frag['alphabet'] = generic_protein
64
65
66 if not self.hmm_as_hit:
67 frag['hit_end'], frag['query_end'] = \
68 frag['query_end'], frag['hit_end']
69 frag['hit_start'], frag['query_start'] = \
70 frag['query_start'], frag['hit_start']
71
72 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
73
75 """Generator function that returns QueryResult objects."""
76
77 state_EOF = 0
78 state_QRES_NEW = 1
79 state_QRES_SAME = 3
80 state_HIT_NEW = 2
81 state_HIT_SAME = 4
82
83 qres_state = None
84 hit_state = None
85 file_state = None
86
87 prev_qid = None
88 prev_hid = None
89
90 cur, prev = None, None
91 hit_list, hsp_list = [], []
92
93 while True:
94
95 if cur is not None:
96 prev = cur
97 prev_qid = cur_qid
98 prev_hid = cur_hid
99
100 if self.line:
101 cur = self._parse_row()
102 cur_qid = cur['qresult']['id']
103 cur_hid = cur['hit']['id']
104 else:
105 file_state = state_EOF
106
107 cur_qid, cur_hid = None, None
108
109
110 if prev_qid != cur_qid:
111 qres_state = state_QRES_NEW
112 else:
113 qres_state = state_QRES_SAME
114
115 if prev_hid != cur_hid or qres_state == state_QRES_NEW:
116 hit_state = state_HIT_NEW
117 else:
118 hit_state = state_HIT_SAME
119
120
121 if prev is not None:
122
123 frag = HSPFragment(prev_hid, prev_qid)
124 for attr, value in prev['frag'].items():
125 setattr(frag, attr, value)
126 hsp = HSP([frag])
127 for attr, value in prev['hsp'].items():
128 setattr(hsp, attr, value)
129 hsp_list.append(hsp)
130
131
132
133 if hit_state == state_HIT_NEW:
134 hit = Hit(hsp_list)
135 for attr, value in prev['hit'].items():
136 setattr(hit, attr, value)
137 hit_list.append(hit)
138 hsp_list = []
139
140
141 if qres_state == state_QRES_NEW or file_state == state_EOF:
142 qresult = QueryResult(prev_qid, hits=hit_list)
143 for attr, value in prev['qresult'].items():
144 setattr(qresult, attr, value)
145 yield qresult
146
147 if file_state == state_EOF:
148 break
149 hit_list = []
150
151 self.line = self.handle.readline()
152
153
155
156 """Parser for the HMMER domain table format that assumes HMM profile
157 coordinates are hit coordinates."""
158
159 hmm_as_hit = True
160
161
163
164 """Parser for the HMMER domain table format that assumes HMM profile
165 coordinates are query coordinates."""
166
167 hmm_as_hit = False
168
169
177
178
186
187
189
190 """Writer for hmmer3-domtab output format which writes hit coordinates
191 as HMM profile coordinates."""
192
193 hmm_as_hit = True
194
197
199 """Writes to the handle.
200
201 Returns a tuple of how many QueryResult, Hit, and HSP objects were written.
202
203 """
204 handle = self.handle
205 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
206
207 try:
208 first_qresult = qresults.next()
209 except StopIteration:
210 handle.write(self._build_header())
211 else:
212
213 handle.write(self._build_header(first_qresult))
214
215 for qresult in chain([first_qresult], qresults):
216 if qresult:
217 handle.write(self._build_row(qresult))
218 qresult_counter += 1
219 hit_counter += len(qresult)
220 hsp_counter += sum([len(hit) for hit in qresult])
221 frag_counter += sum([len(hit.fragments) for hit in qresult])
222
223 return qresult_counter, hit_counter, hsp_counter, frag_counter
224
226 """Returns the header string of a domain HMMER table output."""
227
228
229
230 if first_qresult:
231
232 qnamew = 20
233 tnamew = max(20, len(first_qresult[0].id))
234 try:
235 qaccw = max(10, len(first_qresult.acc))
236 taccw = max(10, len(first_qresult[0].acc))
237 except AttributeError:
238 qaccw, taccw = 10, 10
239 else:
240 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10
241
242 header = "#%*s %22s %40s %11s %11s %11s\n" % \
243 (tnamew+qnamew-1+15+taccw+qaccw, "", "--- full sequence ---",
244 "-------------- this domain -------------", "hmm coord",
245 "ali coord", "env coord")
246 header += "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s " \
247 "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1,
248 " target name", taccw, "accession", "tlen", qnamew,
249 "query name", qaccw, "accession", "qlen", "E-value", "score",
250 "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias",
251 "from", "to", "from", "to", "from", "to", "acc",
252 "description of target")
253 header += "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s " \
254 "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1,
255 "-------------------", taccw, "----------", "-----",
256 qnamew, "--------------------", qaccw, "----------",
257 "-----", "---------", "------", "-----", "---", "---",
258 "---------", "---------", "------", "-----", "-----", "-----",
259 "-----", "-----", "-----", "-----", "----",
260 "---------------------")
261
262 return header
263
265 """Returns a string or one row or more of the QueryResult object."""
266 rows = ''
267
268
269
270 qnamew = max(20, len(qresult.id))
271 tnamew = max(20, len(qresult[0].id))
272 try:
273 qaccw = max(10, len(qresult.accession))
274 taccw = max(10, len(qresult[0].accession))
275 qresult_acc = qresult.accession
276 except AttributeError:
277 qaccw, taccw = 10, 10
278 qresult_acc = '-'
279
280 for hit in qresult:
281
282
283 try:
284 hit_acc = hit.accession
285 except AttributeError:
286 hit_acc = '-'
287
288 for hsp in hit.hsps:
289 if self.hmm_as_hit:
290 hmm_to = hsp.hit_end
291 hmm_from = hsp.hit_start + 1
292 ali_to = hsp.query_end
293 ali_from = hsp.query_start + 1
294 else:
295 hmm_to = hsp.query_end
296 hmm_from = hsp.query_start + 1
297 ali_to = hsp.hit_end
298 ali_from = hsp.hit_start + 1
299
300 rows += "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d" \
301 " %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld %5d %5d %4.2f %s\n" % \
302 (tnamew, hit.id, taccw, hit_acc, hit.seq_len, qnamew, qresult.id,
303 qaccw, qresult_acc, qresult.seq_len, hit.evalue, hit.bitscore,
304 hit.bias, hsp.domain_index, len(hit.hsps), hsp.evalue_cond, hsp.evalue,
305 hsp.bitscore, hsp.bias, hmm_from, hmm_to, ali_from, ali_to,
306 hsp.env_start + 1, hsp.env_end, hsp.acc_avg, hit.description)
307
308 return rows
309
310
312
313 """Writer for hmmer3-domtab output format which writes query coordinates
314 as HMM profile coordinates."""
315
316 hmm_as_hit = False
317
318
319
320 if __name__ == "__main__":
321 from Bio._utils import run_doctest
322 run_doctest()
323