Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer3_domtab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer3_domtab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER domain table output format.""" 
  7   
  8  from itertools import chain 
  9   
 10  from Bio.Alphabet import generic_protein 
 11  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 12   
 13  from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer 
 14   
 15   
16 -class Hmmer3DomtabParser(Hmmer3TabParser):
17 18 """Base hmmer3-domtab iterator.""" 19
20 - def _parse_row(self):
21 """Returns a dictionary of parsed row values.""" 22 assert self.line 23 cols = [x for x in self.line.strip().split(' ') if x] 24 # if len(cols) > 23, we have extra description columns 25 # combine them all into one string in the 19th column 26 if len(cols) > 23: 27 cols[22] = ' '.join(cols[22:]) 28 elif len(cols) < 23: 29 cols.append('') 30 assert len(cols) == 23 31 32 # assign parsed column data into qresult, hit, and hsp dicts 33 qresult = {} 34 qresult['id'] = cols[3] # query name 35 qresult['accession'] = cols[4] # query accession 36 qresult['seq_len'] = int(cols[5]) # qlen 37 hit = {} 38 hit['id'] = cols[0] # target name 39 hit['accession'] = cols[1] # target accession 40 hit['seq_len'] = int(cols[2]) # tlen 41 hit['evalue'] = float(cols[6]) # evalue 42 hit['bitscore'] = float(cols[7]) # score 43 hit['bias'] = float(cols[8]) # bias 44 hit['description'] = cols[22] # description of target 45 hsp = {} 46 hsp['domain_index'] = int(cols[9]) # # (domain number) 47 # not parsing cols[10] since it's basically len(hit) 48 hsp['evalue_cond'] = float(cols[11]) # c-evalue 49 hsp['evalue'] = float(cols[12]) # i-evalue 50 hsp['bitscore'] = float(cols[13]) # score 51 hsp['bias'] = float(cols[14]) # bias 52 hsp['env_start'] = int(cols[19]) - 1 # env from 53 hsp['env_end'] = int(cols[20]) # env to 54 hsp['acc_avg'] = float(cols[21]) # acc 55 frag = {} 56 # strand is always 0, since HMMER now only handles protein 57 frag['hit_strand'] = frag['query_strand'] = 0 58 frag['hit_start'] = int(cols[15]) - 1 # hmm from 59 frag['hit_end'] = int(cols[16]) # hmm to 60 frag['query_start'] = int(cols[17]) - 1 # ali from 61 frag['query_end'] = int(cols[18]) # ali to 62 # HMMER alphabets are always protein 63 frag['alphabet'] = generic_protein 64 65 # switch hmm<-->ali coordinates if hmm is not hit 66 if not self.hmm_as_hit: 67 frag['hit_end'], frag['query_end'] = \ 68 frag['query_end'], frag['hit_end'] 69 frag['hit_start'], frag['query_start'] = \ 70 frag['query_start'], frag['hit_start'] 71 72 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
73
74 - def _parse_qresult(self):
75 """Generator function that returns QueryResult objects.""" 76 # state values, determines what to do for each line 77 state_EOF = 0 78 state_QRES_NEW = 1 79 state_QRES_SAME = 3 80 state_HIT_NEW = 2 81 state_HIT_SAME = 4 82 # dummies for initial states 83 qres_state = None 84 hit_state = None 85 file_state = None 86 # dummies for initial id caches 87 prev_qid = None 88 prev_hid = None 89 # dummies for initial parsed value containers 90 cur, prev = None, None 91 hit_list, hsp_list = [], [] 92 93 while True: 94 # store previous line's parsed values, for every line after the 1st 95 if cur is not None: 96 prev = cur 97 prev_qid = cur_qid 98 prev_hid = cur_hid 99 # only parse the line if it's not EOF 100 if self.line and not self.line.startswith('#'): 101 cur = self._parse_row() 102 cur_qid = cur['qresult']['id'] 103 cur_hid = cur['hit']['id'] 104 else: 105 file_state = state_EOF 106 # mock ID values since the line is empty 107 cur_qid, cur_hid = None, None 108 109 # get the state of hit and qresult 110 if prev_qid != cur_qid: 111 qres_state = state_QRES_NEW 112 else: 113 qres_state = state_QRES_SAME 114 # new hits are hits with different ids or hits in a new qresult 115 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 116 hit_state = state_HIT_NEW 117 else: 118 hit_state = state_HIT_SAME 119 120 # start creating objects after the first line (i.e. prev is filled) 121 if prev is not None: 122 # each line is basically an HSP with one HSPFragment 123 frag = HSPFragment(prev_hid, prev_qid) 124 for attr, value in prev['frag'].items(): 125 setattr(frag, attr, value) 126 hsp = HSP([frag]) 127 for attr, value in prev['hsp'].items(): 128 setattr(hsp, attr, value) 129 hsp_list.append(hsp) 130 131 # create hit object when we've finished parsing all its hsps 132 # i.e. when hit state is state_HIT_NEW 133 if hit_state == state_HIT_NEW: 134 hit = Hit(hsp_list) 135 for attr, value in prev['hit'].items(): 136 setattr(hit, attr, value) 137 hit_list.append(hit) 138 hsp_list = [] 139 140 # create qresult and yield if we're at a new qresult or EOF 141 if qres_state == state_QRES_NEW or file_state == state_EOF: 142 qresult = QueryResult(hit_list, prev_qid) 143 for attr, value in prev['qresult'].items(): 144 setattr(qresult, attr, value) 145 yield qresult 146 # if current line is EOF, break 147 if file_state == state_EOF: 148 break 149 hit_list = [] 150 151 self.line = self.handle.readline()
152 153
154 -class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser):
155 156 """Parser for the HMMER domain table format that assumes HMM profile 157 coordinates are hit coordinates.""" 158 159 hmm_as_hit = True
160 161
162 -class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser):
163 164 """Parser for the HMMER domain table format that assumes HMM profile 165 coordinates are query coordinates.""" 166 167 hmm_as_hit = False
168 169
170 -class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer):
171 172 """Indexer class for HMMER domain table output that assumes HMM profile 173 coordinates are hit coordinates.""" 174 175 _parser = Hmmer3DomtabHmmhitParser 176 _query_id_idx = 3
177 178
179 -class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer):
180 181 """Indexer class for HMMER domain table output that assumes HMM profile 182 coordinates are query coordinates.""" 183 184 _parser = Hmmer3DomtabHmmqueryParser 185 _query_id_idx = 3
186 187
188 -class Hmmer3DomtabHmmhitWriter(object):
189 190 """Writer for hmmer3-domtab output format which writes hit coordinates 191 as HMM profile coordinates.""" 192 193 hmm_as_hit = True 194
195 - def __init__(self, handle):
196 self.handle = handle
197
198 - def write_file(self, qresults):
199 """Writes to the handle. 200 201 Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 202 203 """ 204 handle = self.handle 205 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 206 207 try: 208 first_qresult = next(qresults) 209 except StopIteration: 210 handle.write(self._build_header()) 211 else: 212 # write header 213 handle.write(self._build_header(first_qresult)) 214 # and then the qresults 215 for qresult in chain([first_qresult], qresults): 216 if qresult: 217 handle.write(self._build_row(qresult)) 218 qresult_counter += 1 219 hit_counter += len(qresult) 220 hsp_counter += sum(len(hit) for hit in qresult) 221 frag_counter += sum(len(hit.fragments) for hit in qresult) 222 223 return qresult_counter, hit_counter, hsp_counter, frag_counter
224
225 - def _build_header(self, first_qresult=None):
226 """Returns the header string of a domain HMMER table output.""" 227 228 # calculate whitespace required 229 # adapted from HMMER's source: src/p7_tophits.c#L1157 230 if first_qresult: 231 #qnamew = max(20, len(first_qresult.id)) 232 qnamew = 20 233 tnamew = max(20, len(first_qresult[0].id)) 234 try: 235 qaccw = max(10, len(first_qresult.acc)) 236 taccw = max(10, len(first_qresult[0].acc)) 237 except AttributeError: 238 qaccw, taccw = 10, 10 239 else: 240 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 241 242 header = "#%*s %22s %40s %11s %11s %11s\n" % \ 243 (tnamew+qnamew-1+15+taccw+qaccw, "", "--- full sequence ---", 244 "-------------- this domain -------------", "hmm coord", 245 "ali coord", "env coord") 246 header += "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s " \ 247 "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1, 248 " target name", taccw, "accession", "tlen", qnamew, 249 "query name", qaccw, "accession", "qlen", "E-value", "score", 250 "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias", 251 "from", "to", "from", "to", "from", "to", "acc", 252 "description of target") 253 header += "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s " \ 254 "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew-1, 255 "-------------------", taccw, "----------", "-----", 256 qnamew, "--------------------", qaccw, "----------", 257 "-----", "---------", "------", "-----", "---", "---", 258 "---------", "---------", "------", "-----", "-----", "-----", 259 "-----", "-----", "-----", "-----", "----", 260 "---------------------") 261 262 return header
263
264 - def _build_row(self, qresult):
265 """Returns a string or one row or more of the QueryResult object.""" 266 rows = '' 267 268 # calculate whitespace required 269 # adapted from HMMER's source: src/p7_tophits.c#L1083 270 qnamew = max(20, len(qresult.id)) 271 tnamew = max(20, len(qresult[0].id)) 272 try: 273 qaccw = max(10, len(qresult.accession)) 274 taccw = max(10, len(qresult[0].accession)) 275 qresult_acc = qresult.accession 276 except AttributeError: 277 qaccw, taccw = 10, 10 278 qresult_acc = '-' 279 280 for hit in qresult: 281 282 # try to get hit accession 283 try: 284 hit_acc = hit.accession 285 except AttributeError: 286 hit_acc = '-' 287 288 for hsp in hit.hsps: 289 if self.hmm_as_hit: 290 hmm_to = hsp.hit_end 291 hmm_from = hsp.hit_start + 1 292 ali_to = hsp.query_end 293 ali_from = hsp.query_start + 1 294 else: 295 hmm_to = hsp.query_end 296 hmm_from = hsp.query_start + 1 297 ali_to = hsp.hit_end 298 ali_from = hsp.hit_start + 1 299 300 rows += "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d" \ 301 " %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld %5d %5d %4.2f %s\n" % \ 302 (tnamew, hit.id, taccw, hit_acc, hit.seq_len, qnamew, qresult.id, 303 qaccw, qresult_acc, qresult.seq_len, hit.evalue, hit.bitscore, 304 hit.bias, hsp.domain_index, len(hit.hsps), hsp.evalue_cond, hsp.evalue, 305 hsp.bitscore, hsp.bias, hmm_from, hmm_to, ali_from, ali_to, 306 hsp.env_start + 1, hsp.env_end, hsp.acc_avg, hit.description) 307 308 return rows
309 310
311 -class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter):
312 313 """Writer for hmmer3-domtab output format which writes query coordinates 314 as HMM profile coordinates.""" 315 316 hmm_as_hit = False
317 318 319 # if not used as a module, run the doctest 320 if __name__ == "__main__": 321 from Bio._utils import run_doctest 322 run_doctest() 323