1
2
3
4
5
6 """Bio.SearchIO parser for HMMER table output format."""
7
8 from itertools import chain
9
10 from Bio._py3k import _as_bytes, _bytes_to_string
11 from Bio.Alphabet import generic_protein
12 from Bio.SearchIO._index import SearchIndexer
13 from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
14
15
16 __all__ = ['Hmmer3TabParser', 'Hmmer3TabIndexer', 'Hmmer3TabWriter']
17
18
20
21 """Parser for the HMMER table format."""
22
26
36
38 """Returns a dictionary of parsed row values."""
39 cols = filter(None, self.line.strip().split(' '))
40
41
42 if len(cols) > 19:
43 cols[18] = ' '.join(cols[18:])
44
45
46 elif len(cols) < 19:
47 cols.append('')
48 assert len(cols) == 19
49
50
51 qresult = {}
52 qresult['id'] = cols[2]
53 qresult['acc'] = cols[3]
54 hit = {}
55 hit['id'] = cols[0]
56 hit['acc'] = cols[1]
57 hit['evalue'] = float(cols[4])
58 hit['bitscore'] = float(cols[5])
59 hit['bias'] = float(cols[6])
60 hit['domain_exp_num'] = float(cols[10])
61 hit['region_num'] = int(cols[11])
62 hit['cluster_num'] = int(cols[12])
63 hit['overlap_num'] = int(cols[13])
64 hit['env_num'] = int(cols[14])
65 hit['domain_obs_num'] = int(cols[15])
66 hit['domain_reported_num'] = int(cols[16])
67 hit['domain_included_num'] = int(cols[17])
68 hit['description'] = cols[18]
69 hsp = {}
70 hsp['evalue'] = float(cols[7])
71 hsp['bitscore'] = float(cols[8])
72 hsp['bias'] = float(cols[9])
73
74 frag = {}
75 frag['hit_strand'] = frag['query_strand'] = 0
76 frag['alphabet'] = generic_protein
77
78 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
79
81 """Generator function that returns QueryResult objects."""
82
83 state_EOF = 0
84 state_QRES_NEW = 1
85 state_QRES_SAME = 3
86
87 qres_state = None
88 file_state = None
89 prev_qid = None
90 cur, prev = None, None
91
92 hit_list = []
93
94 while True:
95
96 if cur is not None:
97 prev = cur
98 prev_qid = cur_qid
99
100 if self.line:
101 cur = self._parse_row()
102 cur_qid = cur['qresult']['id']
103 else:
104 file_state = state_EOF
105
106 cur_qid = None
107
108 if prev_qid != cur_qid:
109 qres_state = state_QRES_NEW
110 else:
111 qres_state = state_QRES_SAME
112
113 if prev is not None:
114
115
116 prev_hid = prev['hit']['id']
117
118
119 frag = HSPFragment(prev_hid, prev_qid)
120 for attr, value in prev['frag'].items():
121 setattr(frag, attr, value)
122 hsp = HSP([frag])
123 for attr, value in prev['hsp'].items():
124 setattr(hsp, attr, value)
125
126
127 hit = Hit([hsp])
128 for attr, value in prev['hit'].items():
129 setattr(hit, attr, value)
130 hit_list.append(hit)
131
132
133 if qres_state == state_QRES_NEW or file_state == state_EOF:
134 qresult = QueryResult(prev_qid, hits=hit_list)
135 for attr, value in prev['qresult'].items():
136 setattr(qresult, attr, value)
137 yield qresult
138
139 if file_state == state_EOF:
140 break
141 hit_list = []
142
143 self.line = self.handle.readline()
144
145
147
148 """Indexer class for HMMER table output."""
149
150 _parser = Hmmer3TabParser
151
152 _query_id_idx = 2
153
155 """Iterates over the file handle; yields key, start offset, and length."""
156 handle = self._handle
157 handle.seek(0)
158 query_id_idx = self._query_id_idx
159 qresult_key = None
160 header_mark = _as_bytes('#')
161 split_mark = _as_bytes(' ')
162
163 line = header_mark
164
165
166 while line.startswith(header_mark):
167 start_offset = handle.tell()
168 line = handle.readline()
169
170
171 while True:
172 end_offset = handle.tell()
173
174 if not line:
175 break
176
177 cols = line.strip().split(split_mark)
178 if qresult_key is None:
179 qresult_key = list(filter(None, cols))[query_id_idx]
180 else:
181 curr_key = list(filter(None, cols))[query_id_idx]
182
183 if curr_key != qresult_key:
184 adj_end = end_offset - len(line)
185 yield _bytes_to_string(qresult_key), start_offset, \
186 adj_end - start_offset
187 qresult_key = curr_key
188 start_offset = adj_end
189
190 line = handle.readline()
191 if not line:
192 yield _bytes_to_string(qresult_key), start_offset, \
193 end_offset - start_offset
194 break
195
197 """Returns the raw string of a QueryResult object from the given offset."""
198 handle = self._handle
199 handle.seek(offset)
200 query_id_idx = self._query_id_idx
201 qresult_key = None
202 qresult_raw = _as_bytes('')
203 split_mark = _as_bytes(' ')
204
205 while True:
206 line = handle.readline()
207 if not line:
208 break
209 cols = list(filter(None, line.strip().split(split_mark)))
210 if qresult_key is None:
211 qresult_key = cols[query_id_idx]
212 else:
213 curr_key = cols[query_id_idx]
214 if curr_key != qresult_key:
215 break
216 qresult_raw += line
217
218 return qresult_raw
219
220
222
223 """Writer for hmmer3-tab output format."""
224
227
229 """Writes to the handle.
230
231 Returns a tuple of how many QueryResult, Hit, and HSP objects were written.
232
233 """
234 handle = self.handle
235 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
236
237 try:
238 first_qresult = qresults.next()
239 except StopIteration:
240 handle.write(self._build_header())
241 else:
242
243 handle.write(self._build_header(first_qresult))
244
245 for qresult in chain([first_qresult], qresults):
246 if qresult:
247 handle.write(self._build_row(qresult))
248 qresult_counter += 1
249 hit_counter += len(qresult)
250 hsp_counter += sum([len(hit) for hit in qresult])
251 frag_counter += sum([len(hit.fragments) for hit in qresult])
252
253 return qresult_counter, hit_counter, hsp_counter, frag_counter
254
256 """Returns the header string of a HMMER table output."""
257
258
259
260 if first_qresult is not None:
261
262 qnamew = 20
263 tnamew = max(20, len(first_qresult[0].id))
264 qaccw = max(10, len(first_qresult.acc))
265 taccw = max(10, len(first_qresult[0].acc))
266 else:
267 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10
268
269 header = "#%*s %22s %22s %33s\n" % \
270 (tnamew + qnamew + taccw + qaccw + 2, "",
271 "--- full sequence ----", "--- best 1 domain ----",
272 "--- domain number estimation ----")
273 header += "#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s " \
274 "%3s %3s %3s %3s %3s %3s %s\n" % (tnamew-1, " target name",
275 taccw, "accession", qnamew, "query name", qaccw,
276 "accession", " E-value", " score", " bias",
277 " E-value", " score", " bias", "exp",
278 "reg", "clu", " ov", "env", "dom", "rep",
279 "inc", "description of target")
280 header += "#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s " \
281 "%3s %3s %3s %3s %3s %s\n" % (tnamew-1, "-------------------",
282 taccw, "----------", qnamew, "--------------------", qaccw,
283 "----------", "---------", "------", "-----", "---------",
284 "------", "-----", "---", "---", "---", "---", "---", "---",
285 "---", "---", "---------------------")
286
287 return header
288
290 """Returns a string or one row or more of the QueryResult object."""
291 rows = ''
292
293
294
295 qnamew = max(20, len(qresult.id))
296 tnamew = max(20, len(qresult[0].id))
297 qaccw = max(10, len(qresult.acc))
298 taccw = max(10, len(qresult[0].acc))
299
300 for hit in qresult:
301 rows += "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f " \
302 "%5.1f %3d %3d %3d %3d %3d %3d %3d %s\n" % (tnamew, hit.id, taccw,
303 hit.acc, qnamew, qresult.id, qaccw, qresult.acc, hit.evalue,
304 hit.bitscore, hit.bias, hit.hsps[0].evalue, hit.hsps[0].bitscore,
305 hit.hsps[0].bias, hit.domain_exp_num, hit.region_num, hit.cluster_num,
306 hit.overlap_num, hit.env_num, hit.domain_obs_num,
307 hit.domain_reported_num, hit.domain_included_num, hit.description)
308
309 return rows
310
311
312
313 if __name__ == "__main__":
314 from Bio._utils import run_doctest
315 run_doctest()
316