Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer3_domtab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer3_domtab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER domain table output format.""" 
  7   
  8  from itertools import chain 
  9   
 10  from Bio.Alphabet import generic_protein 
 11  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 12   
 13  from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer 
 14   
 15   
16 -class Hmmer3DomtabParser(Hmmer3TabParser):
17 """Base hmmer3-domtab iterator.""" 18
19 - def _parse_row(self):
20 """Returns a dictionary of parsed row values.""" 21 assert self.line 22 cols = [x for x in self.line.strip().split(' ') if x] 23 # if len(cols) > 23, we have extra description columns 24 # combine them all into one string in the 19th column 25 if len(cols) > 23: 26 cols[22] = ' '.join(cols[22:]) 27 elif len(cols) < 23: 28 cols.append('') 29 assert len(cols) == 23 30 31 # assign parsed column data into qresult, hit, and hsp dicts 32 qresult = {} 33 qresult['id'] = cols[3] # query name 34 qresult['accession'] = cols[4] # query accession 35 qresult['seq_len'] = int(cols[5]) # qlen 36 hit = {} 37 hit['id'] = cols[0] # target name 38 hit['accession'] = cols[1] # target accession 39 hit['seq_len'] = int(cols[2]) # tlen 40 hit['evalue'] = float(cols[6]) # evalue 41 hit['bitscore'] = float(cols[7]) # score 42 hit['bias'] = float(cols[8]) # bias 43 hit['description'] = cols[22] # description of target 44 hsp = {} 45 hsp['domain_index'] = int(cols[9]) # # (domain number) 46 # not parsing cols[10] since it's basically len(hit) 47 hsp['evalue_cond'] = float(cols[11]) # c-evalue 48 hsp['evalue'] = float(cols[12]) # i-evalue 49 hsp['bitscore'] = float(cols[13]) # score 50 hsp['bias'] = float(cols[14]) # bias 51 hsp['env_start'] = int(cols[19]) - 1 # env from 52 hsp['env_end'] = int(cols[20]) # env to 53 hsp['acc_avg'] = float(cols[21]) # acc 54 frag = {} 55 # strand is always 0, since HMMER now only handles protein 56 frag['hit_strand'] = frag['query_strand'] = 0 57 frag['hit_start'] = int(cols[15]) - 1 # hmm from 58 frag['hit_end'] = int(cols[16]) # hmm to 59 frag['query_start'] = int(cols[17]) - 1 # ali from 60 frag['query_end'] = int(cols[18]) # ali to 61 # HMMER alphabets are always protein 62 frag['alphabet'] = generic_protein 63 64 # switch hmm<-->ali coordinates if hmm is not hit 65 if not self.hmm_as_hit: 66 frag['hit_end'], frag['query_end'] = \ 67 frag['query_end'], frag['hit_end'] 68 frag['hit_start'], frag['query_start'] = \ 69 frag['query_start'], frag['hit_start'] 70 71 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
72
73 - def _parse_qresult(self):
74 """Generator function that returns QueryResult objects.""" 75 # state values, determines what to do for each line 76 state_EOF = 0 77 state_QRES_NEW = 1 78 state_QRES_SAME = 3 79 state_HIT_NEW = 2 80 state_HIT_SAME = 4 81 # dummies for initial states 82 qres_state = None 83 hit_state = None 84 file_state = None 85 # dummies for initial id caches 86 prev_qid = None 87 prev_hid = None 88 # dummies for initial parsed value containers 89 cur, prev = None, None 90 hit_list, hsp_list = [], [] 91 cur_qid = None 92 cur_hid = None 93 while True: 94 # store previous line's parsed values, for every line after the 1st 95 if cur is not None: 96 prev = cur 97 prev_qid = cur_qid 98 prev_hid = cur_hid 99 # only parse the line if it's not EOF 100 if self.line and not self.line.startswith('#'): 101 cur = self._parse_row() 102 cur_qid = cur['qresult']['id'] 103 cur_hid = cur['hit']['id'] 104 else: 105 file_state = state_EOF 106 # mock ID values since the line is empty 107 cur_qid, cur_hid = None, None 108 109 # get the state of hit and qresult 110 if prev_qid != cur_qid: 111 qres_state = state_QRES_NEW 112 else: 113 qres_state = state_QRES_SAME 114 # new hits are hits with different ids or hits in a new qresult 115 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 116 hit_state = state_HIT_NEW 117 else: 118 hit_state = state_HIT_SAME 119 120 # start creating objects after the first line (i.e. prev is filled) 121 if prev is not None: 122 # each line is basically an HSP with one HSPFragment 123 frag = HSPFragment(prev_hid, prev_qid) 124 for attr, value in prev['frag'].items(): 125 setattr(frag, attr, value) 126 hsp = HSP([frag]) 127 for attr, value in prev['hsp'].items(): 128 setattr(hsp, attr, value) 129 hsp_list.append(hsp) 130 131 # create hit object when we've finished parsing all its hsps 132 # i.e. when hit state is state_HIT_NEW 133 if hit_state == state_HIT_NEW: 134 hit = Hit(hsp_list) 135 for attr, value in prev['hit'].items(): 136 setattr(hit, attr, value) 137 hit_list.append(hit) 138 hsp_list = [] 139 140 # create qresult and yield if we're at a new qresult or EOF 141 if qres_state == state_QRES_NEW or file_state == state_EOF: 142 qresult = QueryResult(hit_list, prev_qid) 143 for attr, value in prev['qresult'].items(): 144 setattr(qresult, attr, value) 145 yield qresult 146 # if current line is EOF, break 147 if file_state == state_EOF: 148 break 149 hit_list = [] 150 151 self.line = self.handle.readline()
152 153
154 -class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser):
155 """Parser for the HMMER domain table format that assumes HMM profile 156 coordinates are hit coordinates. 157 """ 158 159 hmm_as_hit = True
160 161
162 -class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser):
163 """Parser for the HMMER domain table format that assumes HMM profile 164 coordinates are query coordinates. 165 """ 166 167 hmm_as_hit = False
168 169
170 -class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer):
171 """Indexer class for HMMER domain table output that assumes HMM profile 172 coordinates are hit coordinates. 173 """ 174 175 _parser = Hmmer3DomtabHmmhitParser 176 _query_id_idx = 3
177 178
179 -class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer):
180 """Indexer class for HMMER domain table output that assumes HMM profile 181 coordinates are query coordinates. 182 """ 183 184 _parser = Hmmer3DomtabHmmqueryParser 185 _query_id_idx = 3
186 187
188 -class Hmmer3DomtabHmmhitWriter(object):
189 """Writer for hmmer3-domtab output format which writes hit coordinates 190 as HMM profile coordinates. 191 """ 192 193 hmm_as_hit = True 194
195 - def __init__(self, handle):
196 self.handle = handle
197
198 - def write_file(self, qresults):
199 """Writes to the handle. 200 201 Returns a tuple of how many QueryResult, Hit, and HSP objects were written. 202 203 """ 204 handle = self.handle 205 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 206 207 try: 208 first_qresult = next(qresults) 209 except StopIteration: 210 handle.write(self._build_header()) 211 else: 212 # write header 213 handle.write(self._build_header(first_qresult)) 214 # and then the qresults 215 for qresult in chain([first_qresult], qresults): 216 if qresult: 217 handle.write(self._build_row(qresult)) 218 qresult_counter += 1 219 hit_counter += len(qresult) 220 hsp_counter += sum(len(hit) for hit in qresult) 221 frag_counter += sum(len(hit.fragments) for hit in qresult) 222 223 return qresult_counter, hit_counter, hsp_counter, frag_counter
224
225 - def _build_header(self, first_qresult=None):
226 """Returns the header string of a domain HMMER table output.""" 227 # calculate whitespace required 228 # adapted from HMMER's source: src/p7_tophits.c#L1157 229 if first_qresult: 230 # qnamew = max(20, len(first_qresult.id)) 231 qnamew = 20 232 tnamew = max(20, len(first_qresult[0].id)) 233 try: 234 qaccw = max(10, len(first_qresult.acc)) 235 taccw = max(10, len(first_qresult[0].acc)) 236 except AttributeError: 237 qaccw, taccw = 10, 10 238 else: 239 qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10 240 241 header = "#%*s %22s %40s %11s %11s %11s\n" % \ 242 (tnamew + qnamew - 1 + 15 + taccw + qaccw, "", "--- full sequence ---", 243 "-------------- this domain -------------", "hmm coord", 244 "ali coord", "env coord") 245 header += "#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s " \ 246 "%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew - 1, 247 " target name", taccw, "accession", "tlen", qnamew, 248 "query name", qaccw, "accession", "qlen", "E-value", "score", 249 "bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias", 250 "from", "to", "from", "to", "from", "to", "acc", 251 "description of target") 252 header += "#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s " \ 253 "%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n" % (tnamew - 1, 254 "-------------------", taccw, "----------", "-----", 255 qnamew, "--------------------", qaccw, "----------", 256 "-----", "---------", "------", "-----", "---", "---", 257 "---------", "---------", "------", "-----", "-----", "-----", 258 "-----", "-----", "-----", "-----", "----", 259 "---------------------") 260 261 return header
262
263 - def _build_row(self, qresult):
264 """Returns a string or one row or more of the QueryResult object.""" 265 rows = '' 266 267 # calculate whitespace required 268 # adapted from HMMER's source: src/p7_tophits.c#L1083 269 qnamew = max(20, len(qresult.id)) 270 tnamew = max(20, len(qresult[0].id)) 271 try: 272 qaccw = max(10, len(qresult.accession)) 273 taccw = max(10, len(qresult[0].accession)) 274 qresult_acc = qresult.accession 275 except AttributeError: 276 qaccw, taccw = 10, 10 277 qresult_acc = '-' 278 279 for hit in qresult: 280 281 # try to get hit accession 282 try: 283 hit_acc = hit.accession 284 except AttributeError: 285 hit_acc = '-' 286 287 for hsp in hit.hsps: 288 if self.hmm_as_hit: 289 hmm_to = hsp.hit_end 290 hmm_from = hsp.hit_start + 1 291 ali_to = hsp.query_end 292 ali_from = hsp.query_start + 1 293 else: 294 hmm_to = hsp.query_end 295 hmm_from = hsp.query_start + 1 296 ali_to = hsp.hit_end 297 ali_from = hsp.hit_start + 1 298 299 rows += "%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d %3d" \ 300 " %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld %5d %5d %4.2f %s\n" % \ 301 (tnamew, hit.id, taccw, hit_acc, hit.seq_len, qnamew, qresult.id, 302 qaccw, qresult_acc, qresult.seq_len, hit.evalue, hit.bitscore, 303 hit.bias, hsp.domain_index, len(hit.hsps), hsp.evalue_cond, hsp.evalue, 304 hsp.bitscore, hsp.bias, hmm_from, hmm_to, ali_from, ali_to, 305 hsp.env_start + 1, hsp.env_end, hsp.acc_avg, hit.description) 306 307 return rows
308 309
310 -class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter):
311 """Writer for hmmer3-domtab output format which writes query coordinates 312 as HMM profile coordinates. 313 """ 314 315 hmm_as_hit = False
316 317 318 # if not used as a module, run the doctest 319 if __name__ == "__main__": 320 from Bio._utils import run_doctest 321 run_doctest() 322