Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer3_tab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer3_tab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER table output format.""" 
  7   
  8  from itertools import chain 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio.Alphabet import generic_protein 
 12  from Bio.SearchIO._index import SearchIndexer 
 13  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 14   
 15   
 16  __all__ = ['Hmmer3TabParser', 'Hmmer3TabIndexer', 'Hmmer3TabWriter'] 
 17   
 18  __docformat__ = "restructuredtext en" 
 19   
 20   
21 -class Hmmer3TabParser(object):
22 23 """Parser for the HMMER table format.""" 24
25 - def __init__(self, handle):
26 self.handle = handle 27 self.line = self.handle.readline()
28
29 - def __iter__(self):
30 header_mark = '#' 31 # read through the header if it exists 32 while self.line.startswith(header_mark): 33 self.line = self.handle.readline() 34 # if we have result rows, parse it 35 if self.line: 36 for qresult in self._parse_qresult(): 37 yield qresult
38
39 - def _parse_row(self):
40 """Returns a dictionary of parsed row values.""" 41 cols = [x for x in self.line.strip().split(' ') if x] 42 # if len(cols) > 19, we have extra description columns 43 # combine them all into one string in the 19th column 44 if len(cols) > 19: 45 cols[18] = ' '.join(cols[18:]) 46 # if it's < 19, we have no description columns, so use an empty string 47 # instead 48 elif len(cols) < 19: 49 cols.append('') 50 assert len(cols) == 19 51 52 # assign parsed column data into qresult, hit, and hsp dicts 53 qresult = {} 54 qresult['id'] = cols[2] # query name 55 qresult['accession'] = cols[3] # query accession 56 hit = {} 57 hit['id'] = cols[0] # target name 58 hit['accession'] = cols[1] # target accession 59 hit['evalue'] = float(cols[4]) # evalue (full sequence) 60 hit['bitscore'] = float(cols[5]) # score (full sequence) 61 hit['bias'] = float(cols[6]) # bias (full sequence) 62 hit['domain_exp_num'] = float(cols[10]) # exp 63 hit['region_num'] = int(cols[11]) # reg 64 hit['cluster_num'] = int(cols[12]) # clu 65 hit['overlap_num'] = int(cols[13]) # ov 66 hit['env_num'] = int(cols[14]) # env 67 hit['domain_obs_num'] = int(cols[15]) # dom 68 hit['domain_reported_num'] = int(cols[16]) # rep 69 hit['domain_included_num'] = int(cols[17]) # inc 70 hit['description'] = cols[18] # description of target 71 hsp = {} 72 hsp['evalue'] = float(cols[7]) # evalue (best 1 domain) 73 hsp['bitscore'] = float(cols[8]) # score (best 1 domain) 74 hsp['bias'] = float(cols[9]) # bias (best 1 domain) 75 # strand is always 0, since HMMER now only handles protein 76 frag = {} 77 frag['hit_strand'] = frag['query_strand'] = 0 78 frag['alphabet'] = generic_protein 79 80 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
81
82 - def _parse_qresult(self):
83 """Generator function that returns QueryResult objects.""" 84 # state values, determines what to do for each line 85 state_EOF = 0 86 state_QRES_NEW = 1 87 state_QRES_SAME = 3 88 # initial value dummies 89 qres_state = None 90 file_state = None 91 prev_qid = None 92 cur, prev = None, None 93 # container for Hit objects, used to create QueryResult 94 hit_list = [] 95 96 while True: 97 # store previous line's parsed values for all lines after the first 98 if cur is not None: 99 prev = cur 100 prev_qid = cur_qid 101 # only parse the result row if it's not EOF 102 # NOTE: we are not parsing the extra '#' lines appended to the end 103 # of hmmer31b1 tabular results since storing them in qresult 104 # objects means we can not do a single-pass parsing 105 if self.line and not self.line.startswith('#'): 106 cur = self._parse_row() 107 cur_qid = cur['qresult']['id'] 108 else: 109 file_state = state_EOF 110 # mock value for cur_qid, since we have nothing to parse 111 cur_qid = None 112 113 if prev_qid != cur_qid: 114 qres_state = state_QRES_NEW 115 else: 116 qres_state = state_QRES_SAME 117 118 if prev is not None: 119 # since domain tab formats only have 1 Hit per line 120 # we always create HSPFragment, HSP, and Hit per line 121 prev_hid = prev['hit']['id'] 122 123 # create fragment and HSP and set their attributes 124 frag = HSPFragment(prev_hid, prev_qid) 125 for attr, value in prev['frag'].items(): 126 setattr(frag, attr, value) 127 hsp = HSP([frag]) 128 for attr, value in prev['hsp'].items(): 129 setattr(hsp, attr, value) 130 131 # create Hit and set its attributes 132 hit = Hit([hsp]) 133 for attr, value in prev['hit'].items(): 134 setattr(hit, attr, value) 135 hit_list.append(hit) 136 137 # create qresult and yield if we're at a new qresult or at EOF 138 if qres_state == state_QRES_NEW or file_state == state_EOF: 139 qresult = QueryResult(hit_list, prev_qid) 140 for attr, value in prev['qresult'].items(): 141 setattr(qresult, attr, value) 142 yield qresult 143 # if we're at EOF, break 144 if file_state == state_EOF: 145 break 146 hit_list = [] 147 148 self.line = self.handle.readline()
149 150
151 -class Hmmer3TabIndexer(SearchIndexer):
152 153 """Indexer class for HMMER table output.""" 154 155 _parser = Hmmer3TabParser 156 # denotes column location for query identifier 157 _query_id_idx = 2 158
159 - def __iter__(self):
160 """Iterates over the file handle; yields key, start offset, and length.""" 161 handle = self._handle 162 handle.seek(0) 163 query_id_idx = self._query_id_idx 164 qresult_key = None 165 header_mark = _as_bytes('#') 166 split_mark = _as_bytes(' ') 167 # set line with initial mock value, to emulate header 168 line = header_mark 169 170 # read through header 171 while line.startswith(header_mark): 172 start_offset = handle.tell() 173 line = handle.readline() 174 175 # and index the qresults 176 while True: 177 end_offset = handle.tell() 178 179 if not line: 180 break 181 182 cols = [x for x in line.strip().split(split_mark) if x] 183 if qresult_key is None: 184 qresult_key = cols[query_id_idx] 185 else: 186 curr_key = cols[query_id_idx] 187 188 if curr_key != qresult_key: 189 adj_end = end_offset - len(line) 190 yield _bytes_to_string(qresult_key), start_offset, \ 191 adj_end - start_offset 192 qresult_key = curr_key 193 start_offset = adj_end 194 195 line = handle.readline() 196 if not line: 197 yield _bytes_to_string(qresult_key), start_offset, \ 198 end_offset - start_offset 199 break
200
201 - def get_raw(self, offset):
202 """Returns the raw string of a QueryResult object from the given offset.""" 203 handle = self._handle 204 handle.seek(offset) 205 query_id_idx = self._query_id_idx 206 qresult_key = None 207 qresult_raw = _as_bytes('') 208 split_mark = _as_bytes(' ') 209 210 while True: 211 line = handle.readline() 212 if not line: 213 break 214 cols = [x for x in line.strip().split(split_mark) if x] 215 if qresult_key is None: 216 qresult_key = cols[query_id_idx] 217 else: 218 curr_key = cols[query_id_idx] 219 if curr_key != qresult_key: 220 break 221 qresult_raw += line 222 223 return qresult_raw
224 225
226 -class Hmmer3TabWriter(object):
227 228 """Writer for hmmer3-tab output format.""" 229
230 - def __init__(self, handle):
231 self.handle = handle
232
233 - def write_file(self, qresults):
234 """Writes to the handle. 235 236 Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 237 238 """ 239 handle = self.handle 240 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 241 242 try: 243 first_qresult = next(qresults) 244 except StopIteration: 245 handle.write(self._build_header()) 246 else: 247 # write header 248 handle.write(self._build_header(first_qresult)) 249 # and then the qresults 250 for qresult in chain([first_qresult], qresults): 251 if qresult: 252 handle.write(self._build_row(qresult)) 253 qresult_counter += 1 254 hit_counter += len(qresult) 255 hsp_counter += sum(len(hit) for hit in qresult) 256 frag_counter += sum(len(hit.fragments) for hit in qresult) 257 258 return qresult_counter, hit_counter, hsp_counter, frag_counter
259
260 - def _build_header(self, first_qresult=None):
261 """Returns the header string of a HMMER table output.""" 262 263 # calculate whitespace required 264 # adapted from HMMER's source: src/p7_tophits.c#L1083 265 if first_qresult is not None: 266 # qnamew = max(20, len(first_qresult.id)) 267 qnamew = 20 # why doesn't the above work? 268 tnamew = max(20, len(first_qresult[0].id)) 269 qaccw = max(10, len(first_qresult.accession)) 270 taccw = max(10, len(first_qresult[0].accession)) 271 else: 272 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 273 274 header = "#%*s %22s %22s %33s\n" % \ 275 (tnamew + qnamew + taccw + qaccw + 2, "", 276 "--- full sequence ----", "--- best 1 domain ----", 277 "--- domain number estimation ----") 278 header += "#%-*s %-*s %-*s %-*s %9s %6s %5s %9s %6s %5s %5s %3s " \ 279 "%3s %3s %3s %3s %3s %3s %s\n" % (tnamew-1, " target name", 280 taccw, "accession", qnamew, "query name", qaccw, 281 "accession", " E-value", " score", " bias", 282 " E-value", " score", " bias", "exp", 283 "reg", "clu", " ov", "env", "dom", "rep", 284 "inc", "description of target") 285 header += "#%*s %*s %*s %*s %9s %6s %5s %9s %6s %5s %5s %3s %3s " \ 286 "%3s %3s %3s %3s %3s %s\n" % (tnamew-1, "-------------------", 287 taccw, "----------", qnamew, "--------------------", qaccw, 288 "----------", "---------", "------", "-----", "---------", 289 "------", "-----", "---", "---", "---", "---", "---", "---", 290 "---", "---", "---------------------") 291 292 return header
293
294 - def _build_row(self, qresult):
295 """Returns a string or one row or more of the QueryResult object.""" 296 rows = '' 297 298 # calculate whitespace required 299 # adapted from HMMER's source: src/p7_tophits.c#L1083 300 qnamew = max(20, len(qresult.id)) 301 tnamew = max(20, len(qresult[0].id)) 302 qaccw = max(10, len(qresult.accession)) 303 taccw = max(10, len(qresult[0].accession)) 304 305 for hit in qresult: 306 rows += "%-*s %-*s %-*s %-*s %9.2g %6.1f %5.1f %9.2g %6.1f %5.1f " \ 307 "%5.1f %3d %3d %3d %3d %3d %3d %3d %s\n" % (tnamew, hit.id, taccw, 308 hit.accession, qnamew, qresult.id, qaccw, qresult.accession, hit.evalue, 309 hit.bitscore, hit.bias, hit.hsps[0].evalue, hit.hsps[0].bitscore, 310 hit.hsps[0].bias, hit.domain_exp_num, hit.region_num, hit.cluster_num, 311 hit.overlap_num, hit.env_num, hit.domain_obs_num, 312 hit.domain_reported_num, hit.domain_included_num, hit.description) 313 314 return rows
315 316 317 # if not used as a module, run the doctest 318 if __name__ == "__main__": 319 from Bio._utils import run_doctest 320 run_doctest() 321