Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23  """ 
  24   
  25  from __future__ import print_function 
  26   
  27  import sys 
  28  import re 
  29   
  30  from Bio._py3k import StringIO 
  31  from Bio.ParserSupport import AbstractParser, AbstractConsumer 
  32  from Bio.ParserSupport import read_and_call, read_and_call_until 
  33  from Bio.ParserSupport import read_and_call_while, attempt_read_and_call 
  34  from Bio.ParserSupport import is_blank_line, safe_peekline, safe_readline 
  35  from Bio import File 
  36  from Bio.Blast import Record 
  37   
  38  from Bio import BiopythonDeprecationWarning 
  39  import warnings 
  40  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  41                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  42   
  43   
  44  _score_e_re = re.compile(r'Score +E') 
  45   
  46   
47 -class LowQualityBlastError(Exception):
48 """Error caused by running a low quality sequence through BLAST. 49 50 When low quality sequences (like GenBank entries containing only 51 stretches of a single nucleotide) are BLASTed, they will result in 52 BLAST generating an error and not being able to perform the BLAST. 53 search. This error should be raised for the BLAST reports produced 54 in this case. 55 """ 56 pass
57 58
59 -class ShortQueryBlastError(Exception):
60 """Error caused by running a short query sequence through BLAST. 61 62 If the query sequence is too short, BLAST outputs warnings and errors:: 63 64 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 65 [blastall] ERROR: [000.000] AT1G08320: Blast: 66 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 67 done 68 69 This exception is raised when that condition is detected. 70 """ 71 pass
72 73
74 -class _Scanner(object):
75 """Scan BLAST output from blastall or blastpgp. 76 77 Tested with blastall and blastpgp v2.0.10, v2.0.11 78 79 Methods: 80 - feed Feed data into the scanner. 81 """
82 - def feed(self, handle, consumer):
83 """S.feed(handle, consumer) 84 85 Feed in a BLAST report for scanning. handle is a file-like 86 object that contains the BLAST report. consumer is a Consumer 87 object that will receive events as the report is scanned. 88 """ 89 if isinstance(handle, File.UndoHandle): 90 uhandle = handle 91 else: 92 uhandle = File.UndoHandle(handle) 93 94 # Try to fast-forward to the beginning of the blast report. 95 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 96 # Now scan the BLAST report. 97 self._scan_header(uhandle, consumer) 98 self._scan_rounds(uhandle, consumer) 99 self._scan_database_report(uhandle, consumer) 100 self._scan_parameters(uhandle, consumer)
101
102 - def _scan_header(self, uhandle, consumer):
103 # BLASTP 2.0.10 [Aug-26-1999] 104 # 105 # 106 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 107 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 108 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 109 # programs", Nucleic Acids Res. 25:3389-3402. 110 # 111 # Query= test 112 # (140 letters) 113 # 114 # Database: sdqib40-1.35.seg.fa 115 # 1323 sequences; 223,339 total letters 116 # 117 # ======================================================== 118 # This next example is from the online version of Blast, 119 # note there are TWO references, an RID line, and also 120 # the database is BEFORE the query line. 121 # Note there possibleuse of non-ASCII in the author names. 122 # ======================================================== 123 # 124 # BLASTP 2.2.15 [Oct-15-2006] 125 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 126 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 127 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 128 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 129 # 130 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 131 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 132 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 133 # protein database searches with composition-based statistics 134 # and other refinements", Nucleic Acids Res. 29:2994-3005. 135 # 136 # RID: 1166022616-19998-65316425856.BLASTQ1 137 # 138 # 139 # Database: All non-redundant GenBank CDS 140 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 141 # 4,254,166 sequences; 1,462,033,012 total letters 142 # Query= gi:16127998 143 # Length=428 144 # 145 146 consumer.start_header() 147 148 read_and_call(uhandle, consumer.version, contains='BLAST') 149 read_and_call_while(uhandle, consumer.noevent, blank=1) 150 151 # There might be a <pre> line, for qblast output. 152 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 153 154 # Read the reference(s) 155 while attempt_read_and_call(uhandle, 156 consumer.reference, start='Reference'): 157 # References are normally multiline terminated by a blank line 158 # (or, based on the old code, the RID line) 159 while True: 160 line = uhandle.readline() 161 if is_blank_line(line): 162 consumer.noevent(line) 163 break 164 elif line.startswith("RID"): 165 break 166 else: 167 # More of the reference 168 consumer.reference(line) 169 170 # Deal with the optional RID: ... 171 read_and_call_while(uhandle, consumer.noevent, blank=1) 172 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 173 read_and_call_while(uhandle, consumer.noevent, blank=1) 174 175 # blastpgp may have a reference for compositional score matrix 176 # adjustment (see Bug 2502): 177 if attempt_read_and_call( 178 uhandle, consumer.reference, start="Reference"): 179 read_and_call_until(uhandle, consumer.reference, blank=1) 180 read_and_call_while(uhandle, consumer.noevent, blank=1) 181 182 # blastpgp has a Reference for composition-based statistics. 183 if attempt_read_and_call( 184 uhandle, consumer.reference, start="Reference"): 185 read_and_call_until(uhandle, consumer.reference, blank=1) 186 read_and_call_while(uhandle, consumer.noevent, blank=1) 187 188 line = uhandle.peekline() 189 assert line.strip() != "" 190 assert not line.startswith("RID:") 191 if line.startswith("Query="): 192 # This is an old style query then database... 193 194 # Read the Query lines and the following blank line. 195 read_and_call(uhandle, consumer.query_info, start='Query=') 196 read_and_call_until(uhandle, consumer.query_info, blank=1) 197 read_and_call_while(uhandle, consumer.noevent, blank=1) 198 199 # Read the database lines and the following blank line. 200 read_and_call_until(uhandle, consumer.database_info, end='total letters') 201 read_and_call(uhandle, consumer.database_info, contains='sequences') 202 read_and_call_while(uhandle, consumer.noevent, blank=1) 203 elif line.startswith("Database:"): 204 # This is a new style database then query... 205 read_and_call_until(uhandle, consumer.database_info, end='total letters') 206 read_and_call(uhandle, consumer.database_info, contains='sequences') 207 read_and_call_while(uhandle, consumer.noevent, blank=1) 208 209 # Read the Query lines and the following blank line. 210 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 211 # the "... Score E" line instead. 212 read_and_call(uhandle, consumer.query_info, start='Query=') 213 # BLAST 2.2.25+ has a blank line before Length= 214 read_and_call_until(uhandle, consumer.query_info, start='Length=') 215 while True: 216 line = uhandle.peekline() 217 if not line.strip() or _score_e_re.search(line) is not None: 218 break 219 # It is more of the query (and its length) 220 read_and_call(uhandle, consumer.query_info) 221 read_and_call_while(uhandle, consumer.noevent, blank=1) 222 else: 223 raise ValueError("Invalid header?") 224 225 consumer.end_header()
226
227 - def _scan_rounds(self, uhandle, consumer):
228 # Scan a bunch of rounds. 229 # Each round begins with either a "Searching......" line 230 # or a 'Score E' line followed by descriptions and alignments. 231 # The email server doesn't give the "Searching....." line. 232 # If there is no 'Searching.....' line then you'll first see a 233 # 'Results from round' line 234 235 while not self._eof(uhandle): 236 line = safe_peekline(uhandle) 237 if not line.startswith('Searching') and \ 238 not line.startswith('Results from round') and \ 239 _score_e_re.search(line) is None and \ 240 'No hits found' not in line: 241 break 242 self._scan_descriptions(uhandle, consumer) 243 self._scan_alignments(uhandle, consumer)
244
245 - def _scan_descriptions(self, uhandle, consumer):
246 # Searching..................................................done 247 # Results from round 2 248 # 249 # 250 # Sc 251 # Sequences producing significant alignments: (b 252 # Sequences used in model and found again: 253 # 254 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 255 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 256 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 257 # 258 # Sequences not found previously or not previously below threshold: 259 # 260 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 261 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 262 # 263 264 # If PSI-BLAST, may also have: 265 # 266 # CONVERGED! 267 268 consumer.start_descriptions() 269 270 # Read 'Searching' 271 # This line seems to be missing in BLASTN 2.1.2 (others?) 272 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 273 274 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 275 # If this happens, the handle will yield no more information. 276 if not uhandle.peekline(): 277 raise ValueError("Unexpected end of blast report. " + 278 "Looks suspiciously like a PSI-BLAST crash.") 279 280 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 281 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 282 # [blastall] ERROR: [000.000] AT1G08320: Blast: 283 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 284 # done 285 # Reported by David Weisman. 286 # Check for these error lines and ignore them for now. Let 287 # the BlastErrorParser deal with them. 288 line = uhandle.peekline() 289 if "ERROR:" in line or line.startswith("done"): 290 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 291 read_and_call(uhandle, consumer.noevent, start="done") 292 293 # Check to see if this is PSI-BLAST. 294 # If it is, the 'Searching' line will be followed by: 295 # (version 2.0.10) 296 # Searching............................. 297 # Results from round 2 298 # or (version 2.0.11) 299 # Searching............................. 300 # 301 # 302 # Results from round 2 303 304 # Skip a bunch of blank lines. 305 read_and_call_while(uhandle, consumer.noevent, blank=1) 306 # Check for the results line if it's there. 307 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 308 read_and_call_while(uhandle, consumer.noevent, blank=1) 309 310 # Three things can happen here: 311 # 1. line contains 'Score E' 312 # 2. line contains "No hits found" 313 # 3. no descriptions 314 # The first one begins a bunch of descriptions. The last two 315 # indicates that no descriptions follow, and we should go straight 316 # to the alignments. 317 if not attempt_read_and_call( 318 uhandle, consumer.description_header, 319 has_re=_score_e_re): 320 # Either case 2 or 3. Look for "No hits found". 321 attempt_read_and_call(uhandle, consumer.no_hits, 322 contains='No hits found') 323 try: 324 read_and_call_while(uhandle, consumer.noevent, blank=1) 325 except ValueError as err: 326 if str(err) != "Unexpected end of stream.": 327 raise err 328 329 consumer.end_descriptions() 330 # Stop processing. 331 return 332 333 # Read the score header lines 334 read_and_call(uhandle, consumer.description_header, 335 start='Sequences producing') 336 337 # If PSI-BLAST, read the 'Sequences used in model' line. 338 attempt_read_and_call(uhandle, consumer.model_sequences, 339 start='Sequences used in model') 340 read_and_call_while(uhandle, consumer.noevent, blank=1) 341 342 # In BLAT, rather than a "No hits found" line, we just 343 # get no descriptions (and no alignments). This can be 344 # spotted because the next line is the database block: 345 if safe_peekline(uhandle).startswith(" Database:"): 346 consumer.end_descriptions() 347 # Stop processing. 348 return 349 350 # Read the descriptions and the following blank lines, making 351 # sure that there are descriptions. 352 if not uhandle.peekline().startswith('Sequences not found'): 353 read_and_call_until(uhandle, consumer.description, blank=1) 354 read_and_call_while(uhandle, consumer.noevent, blank=1) 355 356 # If PSI-BLAST, read the 'Sequences not found' line followed 357 # by more descriptions. However, I need to watch out for the 358 # case where there were no sequences not found previously, in 359 # which case there will be no more descriptions. 360 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 361 start='Sequences not found'): 362 # Read the descriptions and the following blank lines. 363 read_and_call_while(uhandle, consumer.noevent, blank=1) 364 l = safe_peekline(uhandle) 365 # Brad -- added check for QUERY. On some PSI-BLAST outputs 366 # there will be a 'Sequences not found' line followed by no 367 # descriptions. Check for this case since the first thing you'll 368 # get is a blank line and then 'QUERY' 369 if not l.startswith('CONVERGED') and l[0] != '>' \ 370 and not l.startswith('QUERY'): 371 read_and_call_until(uhandle, consumer.description, blank=1) 372 read_and_call_while(uhandle, consumer.noevent, blank=1) 373 374 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 375 read_and_call_while(uhandle, consumer.noevent, blank=1) 376 377 consumer.end_descriptions()
378
379 - def _scan_alignments(self, uhandle, consumer):
380 if self._eof(uhandle): 381 return 382 383 # qblast inserts a helpful line here. 384 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 385 386 # First, check to see if I'm at the database report. 387 line = safe_peekline(uhandle) 388 if not line: 389 # EOF 390 return 391 elif line.startswith(' Database') or line.startswith("Lambda"): 392 return 393 elif line[0] == '>': 394 # XXX make a better check here between pairwise and masterslave 395 self._scan_pairwise_alignments(uhandle, consumer) 396 elif line.startswith('Effective'): 397 return 398 else: 399 # XXX put in a check to make sure I'm in a masterslave alignment 400 self._scan_masterslave_alignment(uhandle, consumer)
401
402 - def _scan_pairwise_alignments(self, uhandle, consumer):
403 while not self._eof(uhandle): 404 line = safe_peekline(uhandle) 405 if line[0] != '>': 406 break 407 self._scan_one_pairwise_alignment(uhandle, consumer)
408
409 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
410 if self._eof(uhandle): 411 return 412 consumer.start_alignment() 413 414 self._scan_alignment_header(uhandle, consumer) 415 416 # Scan a bunch of score/alignment pairs. 417 while True: 418 if self._eof(uhandle): 419 # Shouldn't have issued that _scan_alignment_header event... 420 break 421 line = safe_peekline(uhandle) 422 if not line.startswith(' Score'): 423 break 424 self._scan_hsp(uhandle, consumer) 425 consumer.end_alignment()
426
427 - def _scan_alignment_header(self, uhandle, consumer):
428 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 429 # stearothermophilus] 430 # Length = 81 431 # 432 # Or, more recently with different white space: 433 # 434 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 435 # gi|15829258|ref|NP_308031.1| threonine synthase 436 # ... 437 # Length=428 438 read_and_call(uhandle, consumer.title, start='>') 439 while True: 440 line = safe_readline(uhandle) 441 if line.lstrip().startswith(('Length =', 'Length=')): 442 consumer.length(line) 443 break 444 elif is_blank_line(line): 445 # Check to make sure I haven't missed the Length line 446 raise ValueError("I missed the Length in an alignment header") 447 consumer.title(line) 448 449 # Older versions of BLAST will have a line with some spaces. 450 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 451 if not attempt_read_and_call(uhandle, consumer.noevent, 452 start=' '): 453 read_and_call(uhandle, consumer.noevent, blank=1)
454
455 - def _scan_hsp(self, uhandle, consumer):
456 consumer.start_hsp() 457 self._scan_hsp_header(uhandle, consumer) 458 self._scan_hsp_alignment(uhandle, consumer) 459 consumer.end_hsp()
460
461 - def _scan_hsp_header(self, uhandle, consumer):
462 # Score = 22.7 bits (47), Expect = 2.5 463 # Identities = 10/36 (27%), Positives = 18/36 (49%) 464 # Strand = Plus / Plus 465 # Frame = +3 466 # 467 468 read_and_call(uhandle, consumer.score, start=' Score') 469 read_and_call(uhandle, consumer.identities, start=' Identities') 470 # BLASTN 471 attempt_read_and_call(uhandle, consumer.strand, start=' Strand') 472 # BLASTX, TBLASTN, TBLASTX 473 attempt_read_and_call(uhandle, consumer.frame, start=' Frame') 474 read_and_call(uhandle, consumer.noevent, blank=1)
475
476 - def _scan_hsp_alignment(self, uhandle, consumer):
477 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 478 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 479 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 480 # 481 # Query: 64 AEKILIKR 71 482 # I +K 483 # Sbjct: 70 PNIIQLKD 77 484 # 485 486 while True: 487 # Blastn adds an extra line filled with spaces before Query 488 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 489 read_and_call(uhandle, consumer.query, start='Query') 490 read_and_call(uhandle, consumer.align, start=' ') 491 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 492 try: 493 read_and_call_while(uhandle, consumer.noevent, blank=1) 494 except ValueError as err: 495 if str(err) != "Unexpected end of stream.": 496 raise err 497 # End of File (well, it looks like it with recent versions 498 # of BLAST for multiple queries after the Iterator class 499 # has broken up the whole file into chunks). 500 break 501 line = safe_peekline(uhandle) 502 # Alignment continues if I see a 'Query' or the spaces for Blastn. 503 if not (line.startswith('Query') or line.startswith(' ')): 504 break
505
506 - def _scan_masterslave_alignment(self, uhandle, consumer):
507 consumer.start_alignment() 508 while True: 509 line = safe_readline(uhandle) 510 # Check to see whether I'm finished reading the alignment. 511 # This is indicated by 1) database section, 2) next psi-blast 512 # round, which can also be a 'Results from round' if no 513 # searching line is present 514 # patch by chapmanb 515 if line.startswith('Searching') or \ 516 line.startswith('Results from round'): 517 uhandle.saveline(line) 518 break 519 elif line.startswith(' Database'): 520 uhandle.saveline(line) 521 break 522 elif is_blank_line(line): 523 consumer.noevent(line) 524 else: 525 consumer.multalign(line) 526 read_and_call_while(uhandle, consumer.noevent, blank=1) 527 consumer.end_alignment()
528
529 - def _eof(self, uhandle):
530 try: 531 line = safe_peekline(uhandle) 532 except ValueError as err: 533 if str(err) != "Unexpected end of stream.": 534 raise err 535 line = "" 536 return not line
537
538 - def _scan_database_report(self, uhandle, consumer):
539 # Database: sdqib40-1.35.seg.fa 540 # Posted date: Nov 1, 1999 4:25 PM 541 # Number of letters in database: 223,339 542 # Number of sequences in database: 1323 543 # 544 # Lambda K H 545 # 0.322 0.133 0.369 546 # 547 # Gapped 548 # Lambda K H 549 # 0.270 0.0470 0.230 550 # 551 ########################################## 552 # Or, more recently Blast 2.2.15 gives less blank lines 553 ########################################## 554 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 555 # environmental samples 556 # Posted date: Dec 12, 2006 5:51 PM 557 # Number of letters in database: 667,088,753 558 # Number of sequences in database: 2,094,974 559 # Lambda K H 560 # 0.319 0.136 0.395 561 # Gapped 562 # Lambda K H 563 # 0.267 0.0410 0.140 564 565 if self._eof(uhandle): 566 return 567 568 consumer.start_database_report() 569 570 # Subset of the database(s) listed below 571 # Number of letters searched: 562,618,960 572 # Number of sequences searched: 228,924 573 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 574 read_and_call(uhandle, consumer.noevent, contains="letters") 575 read_and_call(uhandle, consumer.noevent, contains="sequences") 576 read_and_call(uhandle, consumer.noevent, start=" ") 577 578 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 579 # was missing the "Database" stanza completely. 580 while attempt_read_and_call(uhandle, consumer.database, 581 start=' Database'): 582 # BLAT output ends abruptly here, without any of the other 583 # information. Check to see if this is the case. If so, 584 # then end the database report here gracefully. 585 if not uhandle.peekline().strip() or \ 586 uhandle.peekline().startswith("BLAST"): 587 consumer.end_database_report() 588 return 589 590 # Database can span multiple lines. 591 read_and_call_until(uhandle, consumer.database, start=' Posted') 592 read_and_call(uhandle, consumer.posted_date, start=' Posted') 593 read_and_call(uhandle, consumer.num_letters_in_database, 594 start=' Number of letters') 595 read_and_call(uhandle, consumer.num_sequences_in_database, 596 start=' Number of sequences') 597 # There may not be a line starting with spaces... 598 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 599 600 line = safe_readline(uhandle) 601 uhandle.saveline(line) 602 if 'Lambda' in line: 603 break 604 605 try: 606 read_and_call(uhandle, consumer.noevent, start='Lambda') 607 read_and_call(uhandle, consumer.ka_params) 608 except Exception: # TODO: ValueError, AttributeError? 609 pass 610 611 # This blank line is optional: 612 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 613 614 # not BLASTP 615 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 616 # not TBLASTX 617 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 618 read_and_call(uhandle, consumer.ka_params_gap) 619 620 # Blast 2.2.4 can sometimes skip the whole parameter section. 621 # Thus, I need to be careful not to read past the end of the 622 # file. 623 try: 624 read_and_call_while(uhandle, consumer.noevent, blank=1) 625 except ValueError as x: 626 if str(x) != "Unexpected end of stream.": 627 raise 628 consumer.end_database_report()
629
630 - def _scan_parameters(self, uhandle, consumer):
631 # Matrix: BLOSUM62 632 # Gap Penalties: Existence: 11, Extension: 1 633 # Number of Hits to DB: 50604 634 # Number of Sequences: 1323 635 # Number of extensions: 1526 636 # Number of successful extensions: 6 637 # Number of sequences better than 10.0: 5 638 # Number of HSP's better than 10.0 without gapping: 5 639 # Number of HSP's successfully gapped in prelim test: 0 640 # Number of HSP's that attempted gapping in prelim test: 1 641 # Number of HSP's gapped (non-prelim): 5 642 # length of query: 140 643 # length of database: 223,339 644 # effective HSP length: 39 645 # effective length of query: 101 646 # effective length of database: 171,742 647 # effective search space: 17345942 648 # effective search space used: 17345942 649 # T: 11 650 # A: 40 651 # X1: 16 ( 7.4 bits) 652 # X2: 38 (14.8 bits) 653 # X3: 64 (24.9 bits) 654 # S1: 41 (21.9 bits) 655 # S2: 42 (20.8 bits) 656 ########################################## 657 # Or, more recently Blast(x) 2.2.15 gives 658 ########################################## 659 # Matrix: BLOSUM62 660 # Gap Penalties: Existence: 11, Extension: 1 661 # Number of Sequences: 4535438 662 # Number of Hits to DB: 2,588,844,100 663 # Number of extensions: 60427286 664 # Number of successful extensions: 126433 665 # Number of sequences better than 2.0: 30 666 # Number of HSP's gapped: 126387 667 # Number of HSP's successfully gapped: 35 668 # Length of query: 291 669 # Length of database: 1,573,298,872 670 # Length adjustment: 130 671 # Effective length of query: 161 672 # Effective length of database: 983,691,932 673 # Effective search space: 158374401052 674 # Effective search space used: 158374401052 675 # Neighboring words threshold: 12 676 # Window for multiple hits: 40 677 # X1: 16 ( 7.3 bits) 678 # X2: 38 (14.6 bits) 679 # X3: 64 (24.7 bits) 680 # S1: 41 (21.7 bits) 681 # S2: 32 (16.9 bits) 682 683 # Blast 2.2.4 can sometimes skip the whole parameter section. 684 # BLAT also skips the whole parameter section. 685 # Thus, check to make sure that the parameter section really 686 # exists. 687 if not uhandle.peekline().strip(): 688 return 689 690 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 691 # "Number of Sequences" lines. 692 consumer.start_parameters() 693 694 # Matrix line may be missing in BLASTN 2.2.9 695 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 696 # not TBLASTX 697 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 698 699 attempt_read_and_call(uhandle, consumer.num_sequences, 700 start='Number of Sequences') 701 attempt_read_and_call(uhandle, consumer.num_hits, 702 start='Number of Hits') 703 attempt_read_and_call(uhandle, consumer.num_sequences, 704 start='Number of Sequences') 705 attempt_read_and_call(uhandle, consumer.num_extends, 706 start='Number of extensions') 707 attempt_read_and_call(uhandle, consumer.num_good_extends, 708 start='Number of successful') 709 710 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 711 start='Number of sequences') 712 713 # not BLASTN, TBLASTX 714 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 715 start="Number of HSP's better"): 716 # BLASTN 2.2.9 717 if attempt_read_and_call(uhandle, consumer.noevent, 718 start="Number of HSP's gapped:"): 719 read_and_call(uhandle, consumer.noevent, 720 start="Number of HSP's successfully") 721 # This is omitted in 2.2.15 722 attempt_read_and_call(uhandle, consumer.noevent, 723 start="Number of extra gapped extensions") 724 else: 725 read_and_call(uhandle, consumer.hsps_prelim_gapped, 726 start="Number of HSP's successfully") 727 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 728 start="Number of HSP's that") 729 read_and_call(uhandle, consumer.hsps_gapped, 730 start="Number of HSP's gapped") 731 # e.g. BLASTX 2.2.15 where the "better" line is missing 732 elif attempt_read_and_call(uhandle, consumer.noevent, 733 start="Number of HSP's gapped"): 734 read_and_call(uhandle, consumer.noevent, 735 start="Number of HSP's successfully") 736 737 # not in blastx 2.2.1 738 attempt_read_and_call(uhandle, consumer.query_length, 739 has_re=re.compile(r"[Ll]ength of query")) 740 # Not in BLASTX 2.2.22+ 741 attempt_read_and_call(uhandle, consumer.database_length, 742 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 743 744 # BLASTN 2.2.9 745 attempt_read_and_call(uhandle, consumer.noevent, 746 start="Length adjustment") 747 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 748 start='effective HSP') 749 # Not in blastx 2.2.1 750 attempt_read_and_call( 751 uhandle, consumer.effective_query_length, 752 has_re=re.compile(r'[Ee]ffective length of query')) 753 754 # This is not in BLASTP 2.2.15 755 attempt_read_and_call( 756 uhandle, consumer.effective_database_length, 757 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 758 # Not in blastx 2.2.1, added a ':' to distinguish between 759 # this and the 'effective search space used' line 760 attempt_read_and_call( 761 uhandle, consumer.effective_search_space, 762 has_re=re.compile(r'[Ee]ffective search space:')) 763 # Does not appear in BLASTP 2.0.5 764 attempt_read_and_call( 765 uhandle, consumer.effective_search_space_used, 766 has_re=re.compile(r'[Ee]ffective search space used')) 767 768 # BLASTX, TBLASTN, TBLASTX 769 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 770 771 # not in BLASTN 2.2.9 772 attempt_read_and_call(uhandle, consumer.threshold, start='T') 773 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 774 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 775 776 # not in BLASTX 2.2.15 777 attempt_read_and_call(uhandle, consumer.window_size, start='A') 778 # get this instead: "Window for multiple hits: 40" 779 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 780 781 # not in BLASTX 2.2.22+ 782 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 783 # not TBLASTN 784 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 785 786 # not BLASTN, TBLASTX 787 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 788 start='X3') 789 790 # not TBLASTN 791 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 792 # not in blastx 2.2.1 793 # first we make sure we have additional lines to work with, if 794 # not then the file is done and we don't have a final S2 795 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 796 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 797 798 consumer.end_parameters()
799 800
801 -class BlastParser(AbstractParser):
802 """Parses BLAST data into a Record.Blast object. 803 804 """
805 - def __init__(self):
806 """__init__(self)""" 807 self._scanner = _Scanner() 808 self._consumer = _BlastConsumer()
809
810 - def parse(self, handle):
811 """parse(self, handle)""" 812 self._scanner.feed(handle, self._consumer) 813 return self._consumer.data
814 815
816 -class PSIBlastParser(AbstractParser):
817 """Parses BLAST data into a Record.PSIBlast object. 818 819 """
820 - def __init__(self):
821 """__init__(self)""" 822 self._scanner = _Scanner() 823 self._consumer = _PSIBlastConsumer()
824
825 - def parse(self, handle):
826 """parse(self, handle)""" 827 self._scanner.feed(handle, self._consumer) 828 return self._consumer.data
829 830
831 -class _HeaderConsumer(object):
832 - def start_header(self):
833 self._header = Record.Header()
834
835 - def version(self, line):
836 c = line.split() 837 self._header.application = c[0] 838 self._header.version = c[1] 839 if len(c) > 2: 840 # The date is missing in the new C++ output from blastx 2.2.22+ 841 # Just get "BLASTX 2.2.22+\n" and that's all. 842 self._header.date = c[2][1:-1]
843
844 - def reference(self, line):
845 if line.startswith('Reference: '): 846 self._header.reference = line[11:] 847 else: 848 self._header.reference = self._header.reference + line
849
850 - def query_info(self, line):
851 if line.startswith('Query= '): 852 self._header.query = line[7:].lstrip() 853 elif line.startswith('Length='): 854 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 855 self._header.query_letters = _safe_int(line[7:].strip()) 856 elif not line.startswith(' '): # continuation of query_info 857 self._header.query = "%s%s" % (self._header.query, line) 858 else: 859 # Hope it is the old style way to give the query length: 860 letters, = _re_search( 861 r"([0-9,]+) letters", line, 862 "I could not find the number of letters in line\n%s" % line) 863 self._header.query_letters = _safe_int(letters)
864
865 - def database_info(self, line):
866 line = line.rstrip() 867 if line.startswith('Database: '): 868 self._header.database = line[10:] 869 elif not line.endswith('total letters'): 870 if self._header.database: 871 # Need to include a space when merging multi line datase descr 872 self._header.database = self._header.database + " " + line.strip() 873 else: 874 self._header.database = line.strip() 875 else: 876 sequences, letters = _re_search( 877 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 878 "I could not find the sequences and letters in line\n%s" % line) 879 self._header.database_sequences = _safe_int(sequences) 880 self._header.database_letters = _safe_int(letters)
881
882 - def end_header(self):
883 # Get rid of the trailing newlines 884 self._header.reference = self._header.reference.rstrip() 885 self._header.query = self._header.query.rstrip()
886 887
888 -class _DescriptionConsumer(object):
889 - def start_descriptions(self):
890 self._descriptions = [] 891 self._model_sequences = [] 892 self._nonmodel_sequences = [] 893 self._converged = 0 894 self._type = None 895 self._roundnum = None 896 897 self.__has_n = 0 # Does the description line contain an N value?
898
899 - def description_header(self, line):
900 if line.startswith('Sequences producing'): 901 cols = line.split() 902 if cols[-1] == 'N': 903 self.__has_n = 1
904
905 - def description(self, line):
906 dh = self._parse(line) 907 if self._type == 'model': 908 self._model_sequences.append(dh) 909 elif self._type == 'nonmodel': 910 self._nonmodel_sequences.append(dh) 911 else: 912 self._descriptions.append(dh)
913
914 - def model_sequences(self, line):
915 self._type = 'model'
916
917 - def nonmodel_sequences(self, line):
918 self._type = 'nonmodel'
919
920 - def converged(self, line):
921 self._converged = 1
922
923 - def no_hits(self, line):
924 pass
925
926 - def round(self, line):
927 if not line.startswith('Results from round'): 928 raise ValueError("I didn't understand the round line\n%s" % line) 929 self._roundnum = _safe_int(line[18:].strip())
930
931 - def end_descriptions(self):
932 pass
933
934 - def _parse(self, description_line):
935 line = description_line # for convenience 936 dh = Record.Description() 937 938 # I need to separate the score and p-value from the title. 939 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 940 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 941 # special cases to handle: 942 # - title must be preserved exactly (including whitespaces) 943 # - score could be equal to e-value (not likely, but what if??) 944 # - sometimes there's an "N" score of '1'. 945 cols = line.split() 946 if len(cols) < 3: 947 raise ValueError( 948 "Line does not appear to contain description:\n%s" % line) 949 if self.__has_n: 950 i = line.rfind(cols[-1]) # find start of N 951 i = line.rfind(cols[-2], 0, i) # find start of p-value 952 i = line.rfind(cols[-3], 0, i) # find start of score 953 else: 954 i = line.rfind(cols[-1]) # find start of p-value 955 i = line.rfind(cols[-2], 0, i) # find start of score 956 if self.__has_n: 957 dh.title, dh.score, dh.e, dh.num_alignments = \ 958 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 959 else: 960 dh.title, dh.score, dh.e, dh.num_alignments = \ 961 line[:i].rstrip(), cols[-2], cols[-1], 1 962 dh.num_alignments = _safe_int(dh.num_alignments) 963 dh.score = _safe_int(dh.score) 964 dh.e = _safe_float(dh.e) 965 return dh
966 967
968 -class _AlignmentConsumer(object):
969 # This is a little bit tricky. An alignment can either be a 970 # pairwise alignment or a multiple alignment. Since it's difficult 971 # to know a-priori which one the blast record will contain, I'm going 972 # to make one class that can parse both of them.
973 - def start_alignment(self):
974 self._alignment = Record.Alignment() 975 self._multiple_alignment = Record.MultipleAlignment()
976
977 - def title(self, line):
978 if self._alignment.title: 979 self._alignment.title += " " 980 self._alignment.title += line.strip()
981
982 - def length(self, line):
983 # e.g. "Length = 81" or more recently, "Length=428" 984 parts = line.replace(" ", "").split("=") 985 assert len(parts) == 2, "Unrecognised format length line" 986 self._alignment.length = parts[1] 987 self._alignment.length = _safe_int(self._alignment.length)
988
989 - def multalign(self, line):
990 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 991 if line.startswith('QUERY') or line.startswith('blast_tmp'): 992 # If this is the first line of the multiple alignment, 993 # then I need to figure out how the line is formatted. 994 995 # Format of line is: 996 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 997 try: 998 name, start, seq, end = line.split() 999 except ValueError: 1000 raise ValueError("I do not understand the line\n%s" % line) 1001 self._start_index = line.index(start, len(name)) 1002 self._seq_index = line.index(seq, 1003 self._start_index + len(start)) 1004 # subtract 1 for the space 1005 self._name_length = self._start_index - 1 1006 self._start_length = self._seq_index - self._start_index - 1 1007 self._seq_length = line.rfind(end) - self._seq_index - 1 1008 1009 # self._seq_index = line.index(seq) 1010 # # subtract 1 for the space 1011 # self._seq_length = line.rfind(end) - self._seq_index - 1 1012 # self._start_index = line.index(start) 1013 # self._start_length = self._seq_index - self._start_index - 1 1014 # self._name_length = self._start_index 1015 1016 # Extract the information from the line 1017 name = line[:self._name_length] 1018 name = name.rstrip() 1019 start = line[self._start_index:self._start_index + self._start_length] 1020 start = start.rstrip() 1021 if start: 1022 start = _safe_int(start) 1023 end = line[self._seq_index + self._seq_length:].rstrip() 1024 if end: 1025 end = _safe_int(end) 1026 seq = line[self._seq_index:self._seq_index + self._seq_length].rstrip() 1027 # right pad the sequence with spaces if necessary 1028 if len(seq) < self._seq_length: 1029 seq += ' ' * (self._seq_length - len(seq)) 1030 1031 # I need to make sure the sequence is aligned correctly with the query. 1032 # First, I will find the length of the query. Then, if necessary, 1033 # I will pad my current sequence with spaces so that they will line 1034 # up correctly. 1035 1036 # Two possible things can happen: 1037 # QUERY 1038 # 504 1039 # 1040 # QUERY 1041 # 403 1042 # 1043 # Sequence 504 will need padding at the end. Since I won't know 1044 # this until the end of the alignment, this will be handled in 1045 # end_alignment. 1046 # Sequence 403 will need padding before being added to the alignment. 1047 1048 align = self._multiple_alignment.alignment # for convenience 1049 align.append((name, start, seq, end))
1050 1051 # This is old code that tried to line up all the sequences 1052 # in a multiple alignment by using the sequence title's as 1053 # identifiers. The problem with this is that BLAST assigns 1054 # different HSP's from the same sequence the same id. Thus, 1055 # in one alignment block, there may be multiple sequences with 1056 # the same id. I'm not sure how to handle this, so I'm not 1057 # going to. 1058 1059 # # If the sequence is the query, then just add it. 1060 # if name == 'QUERY': 1061 # if len(align) == 0: 1062 # align.append((name, start, seq)) 1063 # else: 1064 # aname, astart, aseq = align[0] 1065 # if name != aname: 1066 # raise ValueError, "Query is not the first sequence" 1067 # aseq = aseq + seq 1068 # align[0] = aname, astart, aseq 1069 # else: 1070 # if len(align) == 0: 1071 # raise ValueError, "I could not find the query sequence" 1072 # qname, qstart, qseq = align[0] 1073 # 1074 # # Now find my sequence in the multiple alignment. 1075 # for i in range(1, len(align)): 1076 # aname, astart, aseq = align[i] 1077 # if name == aname: 1078 # index = i 1079 # break 1080 # else: 1081 # # If I couldn't find it, then add a new one. 1082 # align.append((None, None, None)) 1083 # index = len(align)-1 1084 # # Make sure to left-pad it. 1085 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1086 # 1087 # if len(qseq) != len(aseq) + len(seq): 1088 # # If my sequences are shorter than the query sequence, 1089 # # then I will need to pad some spaces to make them line up. 1090 # # Since I've already right padded seq, that means aseq 1091 # # must be too short. 1092 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1093 # aseq = aseq + seq 1094 # if astart is None: 1095 # astart = start 1096 # align[index] = aname, astart, aseq 1097
1098 - def end_alignment(self):
1099 # Remove trailing newlines 1100 if self._alignment: 1101 self._alignment.title = self._alignment.title.rstrip() 1102 1103 # This code is also obsolete. See note above. 1104 # If there's a multiple alignment, I will need to make sure 1105 # all the sequences are aligned. That is, I may need to 1106 # right-pad the sequences. 1107 # if self._multiple_alignment is not None: 1108 # align = self._multiple_alignment.alignment 1109 # seqlen = None 1110 # for i in range(len(align)): 1111 # name, start, seq = align[i] 1112 # if seqlen is None: 1113 # seqlen = len(seq) 1114 # else: 1115 # if len(seq) < seqlen: 1116 # seq = seq + ' '*(seqlen - len(seq)) 1117 # align[i] = name, start, seq 1118 # elif len(seq) > seqlen: 1119 # raise ValueError, \ 1120 # "Sequence %s is longer than the query" % name 1121 1122 # Clean up some variables, if they exist. 1123 try: 1124 del self._seq_index 1125 del self._seq_length 1126 del self._start_index 1127 del self._start_length 1128 del self._name_length 1129 except AttributeError: 1130 pass
1131 1132
1133 -class _HSPConsumer(object):
1134 - def start_hsp(self):
1135 self._hsp = Record.HSP()
1136
1137 - def score(self, line):
1138 self._hsp.bits, self._hsp.score = _re_search( 1139 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1140 "I could not find the score in line\n%s" % line) 1141 self._hsp.score = _safe_float(self._hsp.score) 1142 self._hsp.bits = _safe_float(self._hsp.bits) 1143 1144 x, y = _re_search( 1145 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1146 "I could not find the expect in line\n%s" % line) 1147 if x: 1148 self._hsp.num_alignments = _safe_int(x) 1149 else: 1150 self._hsp.num_alignments = 1 1151 self._hsp.expect = _safe_float(y)
1152
1153 - def identities(self, line):
1154 x, y = _re_search( 1155 r"Identities = (\d+)\/(\d+)", line, 1156 "I could not find the identities in line\n%s" % line) 1157 self._hsp.identities = _safe_int(x), _safe_int(y) 1158 self._hsp.align_length = _safe_int(y) 1159 1160 if 'Positives' in line: 1161 x, y = _re_search( 1162 r"Positives = (\d+)\/(\d+)", line, 1163 "I could not find the positives in line\n%s" % line) 1164 self._hsp.positives = _safe_int(x), _safe_int(y) 1165 assert self._hsp.align_length == _safe_int(y) 1166 1167 if 'Gaps' in line: 1168 x, y = _re_search( 1169 r"Gaps = (\d+)\/(\d+)", line, 1170 "I could not find the gaps in line\n%s" % line) 1171 self._hsp.gaps = _safe_int(x), _safe_int(y) 1172 assert self._hsp.align_length == _safe_int(y)
1173
1174 - def strand(self, line):
1175 self._hsp.strand = _re_search( 1176 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1177 "I could not find the strand in line\n%s" % line)
1178
1179 - def frame(self, line):
1180 # Frame can be in formats: 1181 # Frame = +1 1182 # Frame = +2 / +2 1183 if '/' in line: 1184 self._hsp.frame = _re_search( 1185 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1186 "I could not find the frame in line\n%s" % line) 1187 else: 1188 self._hsp.frame = _re_search( 1189 r"Frame = ([-+][123])", line, 1190 "I could not find the frame in line\n%s" % line)
1191 1192 # Match a space, if one is available. Masahir Ishikawa found a 1193 # case where there's no space between the start and the sequence: 1194 # Query: 100tt 101 1195 # line below modified by Yair Benita, Sep 2004 1196 # Note that the colon is not always present. 2006 1197 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1198
1199 - def query(self, line):
1200 m = self._query_re.search(line) 1201 if m is None: 1202 if line.strip() == "Query ------------------------------------------------------------": 1203 # Special case - long gap relative to the subject, 1204 # note there is no start/end present, cannot update those 1205 self._hsp.query += "-" * 60 1206 self._query_len = 60 # number of dashes 1207 self._query_start_index = 13 # offset of first dash 1208 return 1209 raise ValueError("I could not find the query in line\n%s" % line) 1210 1211 # line below modified by Yair Benita, Sep 2004. 1212 # added the end attribute for the query 1213 colon, start, seq, end = m.groups() 1214 seq = seq.strip() 1215 self._hsp.query += seq 1216 if self._hsp.query_start is None: 1217 self._hsp.query_start = _safe_int(start) 1218 1219 # line below added by Yair Benita, Sep 2004. 1220 # added the end attribute for the query 1221 self._hsp.query_end = _safe_int(end) 1222 1223 # Get index for sequence start (regular expression element 3) 1224 self._query_start_index = m.start(3) 1225 self._query_len = len(seq)
1226
1227 - def align(self, line):
1228 seq = line[self._query_start_index:].rstrip() 1229 if len(seq) < self._query_len: 1230 # Make sure the alignment is the same length as the query 1231 seq += ' ' * (self._query_len - len(seq)) 1232 elif len(seq) < self._query_len: 1233 raise ValueError("Match is longer than the query in line\n%s" 1234 % line) 1235 self._hsp.match = self._hsp.match + seq
1236 1237 # To match how we do the query, cache the regular expression. 1238 # Note that the colon is not always present. 1239 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1240
1241 - def sbjct(self, line):
1242 m = self._sbjct_re.search(line) 1243 if m is None: 1244 raise ValueError("I could not find the sbjct in line\n%s" % line) 1245 colon, start, seq, end = m.groups() 1246 # mikep 26/9/00 1247 # On occasion, there is a blast hit with no subject match 1248 # so far, it only occurs with 1-line short "matches" 1249 # I have decided to let these pass as they appear 1250 if not seq.strip(): 1251 seq = ' ' * self._query_len 1252 else: 1253 seq = seq.strip() 1254 self._hsp.sbjct += seq 1255 if self._hsp.sbjct_start is None: 1256 self._hsp.sbjct_start = _safe_int(start) 1257 1258 self._hsp.sbjct_end = _safe_int(end) 1259 if len(seq) != self._query_len: 1260 raise ValueError( 1261 "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s" 1262 % (self._query_len, self._hsp.query, len(seq), line)) 1263 1264 del self._query_start_index # clean up unused variables 1265 del self._query_len
1266
1267 - def end_hsp(self):
1268 pass
1269 1270
1271 -class _DatabaseReportConsumer(object):
1272
1273 - def start_database_report(self):
1274 self._dr = Record.DatabaseReport()
1275
1276 - def database(self, line):
1277 m = re.search(r"Database: (.+)$", line) 1278 if m: 1279 self._dr.database_name.append(m.group(1)) 1280 elif self._dr.database_name: 1281 # This must be a continuation of the previous name. 1282 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1283 line.strip())
1284
1285 - def posted_date(self, line):
1286 self._dr.posted_date.append(_re_search( 1287 r"Posted date:\s*(.+)$", line, 1288 "I could not find the posted date in line\n%s" % line))
1289
1290 - def num_letters_in_database(self, line):
1291 letters, = _get_cols( 1292 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}) 1293 self._dr.num_letters_in_database.append(_safe_int(letters))
1294
1295 - def num_sequences_in_database(self, line):
1296 sequences, = _get_cols( 1297 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}) 1298 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1299
1300 - def ka_params(self, line):
1301 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1302
1303 - def gapped(self, line):
1304 self._dr.gapped = 1
1305
1306 - def ka_params_gap(self, line):
1307 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1308
1309 - def end_database_report(self):
1310 pass
1311 1312
1313 -class _ParametersConsumer(object):
1314 - def start_parameters(self):
1315 self._params = Record.Parameters()
1316
1317 - def matrix(self, line):
1318 self._params.matrix = line[8:].rstrip()
1319
1320 - def gap_penalties(self, line):
1321 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1322 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"})]
1323
1324 - def num_hits(self, line):
1325 if '1st pass' in line: 1326 x, = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1327 self._params.num_hits = _safe_int(x) 1328 else: 1329 x, = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1330 self._params.num_hits = _safe_int(x)
1331
1332 - def num_sequences(self, line):
1333 if '1st pass' in line: 1334 x, = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1335 self._params.num_sequences = _safe_int(x) 1336 else: 1337 x, = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1338 self._params.num_sequences = _safe_int(x)
1339
1340 - def num_extends(self, line):
1341 if '1st pass' in line: 1342 x, = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1343 self._params.num_extends = _safe_int(x) 1344 else: 1345 x, = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1346 self._params.num_extends = _safe_int(x)
1347
1348 - def num_good_extends(self, line):
1349 if '1st pass' in line: 1350 x, = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1351 self._params.num_good_extends = _safe_int(x) 1352 else: 1353 x, = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1354 self._params.num_good_extends = _safe_int(x)
1355
1356 - def num_seqs_better_e(self, line):
1357 self._params.num_seqs_better_e, = _get_cols( 1358 line, (-1,), ncols=7, expected={2: "sequences"}) 1359 self._params.num_seqs_better_e = _safe_int( 1360 self._params.num_seqs_better_e)
1361
1362 - def hsps_no_gap(self, line):
1363 self._params.hsps_no_gap, = _get_cols( 1364 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}) 1365 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1366
1367 - def hsps_prelim_gapped(self, line):
1368 self._params.hsps_prelim_gapped, = _get_cols( 1369 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}) 1370 self._params.hsps_prelim_gapped = _safe_int( 1371 self._params.hsps_prelim_gapped)
1372
1373 - def hsps_prelim_gapped_attempted(self, line):
1374 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1375 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}) 1376 self._params.hsps_prelim_gapped_attempted = _safe_int( 1377 self._params.hsps_prelim_gapped_attempted)
1378
1379 - def hsps_gapped(self, line):
1380 self._params.hsps_gapped, = _get_cols( 1381 line, (-1,), ncols=6, expected={3: "gapped"}) 1382 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1383
1384 - def query_length(self, line):
1385 self._params.query_length, = _get_cols( 1386 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}) 1387 self._params.query_length = _safe_int(self._params.query_length)
1388
1389 - def database_length(self, line):
1390 self._params.database_length, = _get_cols( 1391 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}) 1392 self._params.database_length = _safe_int(self._params.database_length)
1393
1394 - def effective_hsp_length(self, line):
1395 self._params.effective_hsp_length, = _get_cols( 1396 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}) 1397 self._params.effective_hsp_length = _safe_int( 1398 self._params.effective_hsp_length)
1399
1400 - def effective_query_length(self, line):
1401 self._params.effective_query_length, = _get_cols( 1402 line, (-1,), ncols=5, expected={1: "length", 3: "query:"}) 1403 self._params.effective_query_length = _safe_int( 1404 self._params.effective_query_length)
1405
1406 - def effective_database_length(self, line):
1407 self._params.effective_database_length, = _get_cols( 1408 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}) 1409 self._params.effective_database_length = _safe_int( 1410 self._params.effective_database_length)
1411
1412 - def effective_search_space(self, line):
1413 self._params.effective_search_space, = _get_cols( 1414 line, (-1,), ncols=4, expected={1: "search"}) 1415 self._params.effective_search_space = _safe_int( 1416 self._params.effective_search_space)
1417
1418 - def effective_search_space_used(self, line):
1419 self._params.effective_search_space_used, = _get_cols( 1420 line, (-1,), ncols=5, expected={1: "search", 3: "used:"}) 1421 self._params.effective_search_space_used = _safe_int( 1422 self._params.effective_search_space_used)
1423
1424 - def frameshift(self, line):
1425 self._params.frameshift = _get_cols(line, (4, 5), ncols=6, 1426 expected={0: "frameshift", 2: "decay"})
1427
1428 - def threshold(self, line):
1429 if line[:2] == "T:": 1430 # Assume its an old style line like "T: 123" 1431 self._params.threshold, = _get_cols(line, (1,), ncols=2, 1432 expected={0: "T:"}) 1433 elif line[:28] == "Neighboring words threshold:": 1434 self._params.threshold, = _get_cols(line, (3,), ncols=4, 1435 expected={0: "Neighboring", 1436 1: "words", 1437 2: "threshold:"}) 1438 else: 1439 raise ValueError("Unrecognised threshold line:\n%s" % line) 1440 self._params.threshold = _safe_int(self._params.threshold)
1441
1442 - def window_size(self, line):
1443 if line[:2] == "A:": 1444 self._params.window_size, = _get_cols(line, (1,), ncols=2, 1445 expected={0: "A:"}) 1446 elif line[:25] == "Window for multiple hits:": 1447 self._params.window_size, = _get_cols(line, (4,), ncols=5, 1448 expected={0: "Window", 1449 2: "multiple", 1450 3: "hits:"}) 1451 else: 1452 raise ValueError("Unrecognised window size line:\n%s" % line) 1453 self._params.window_size = _safe_int(self._params.window_size)
1454
1455 - def dropoff_1st_pass(self, line):
1456 score, bits = _re_search( 1457 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1458 "I could not find the dropoff in line\n%s" % line) 1459 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1460
1461 - def gap_x_dropoff(self, line):
1462 score, bits = _re_search( 1463 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1464 "I could not find the gap dropoff in line\n%s" % line) 1465 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1466
1467 - def gap_x_dropoff_final(self, line):
1468 score, bits = _re_search( 1469 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1470 "I could not find the gap dropoff final in line\n%s" % line) 1471 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1472
1473 - def gap_trigger(self, line):
1474 score, bits = _re_search( 1475 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1476 "I could not find the gap trigger in line\n%s" % line) 1477 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1478
1479 - def blast_cutoff(self, line):
1480 score, bits = _re_search( 1481 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1482 "I could not find the blast cutoff in line\n%s" % line) 1483 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1484
1485 - def end_parameters(self):
1486 pass
1487 1488
1489 -class _BlastConsumer(AbstractConsumer, 1490 _HeaderConsumer, 1491 _DescriptionConsumer, 1492 _AlignmentConsumer, 1493 _HSPConsumer, 1494 _DatabaseReportConsumer, 1495 _ParametersConsumer 1496 ):
1497 # This Consumer is inherits from many other consumer classes that handle 1498 # the actual dirty work. An alternate way to do it is to create objects 1499 # of those classes and then delegate the parsing tasks to them in a 1500 # decorator-type pattern. The disadvantage of that is that the method 1501 # names will need to be resolved in this classes. However, using 1502 # a decorator will retain more control in this class (which may or 1503 # may not be a bad thing). In addition, having each sub-consumer as 1504 # its own object prevents this object's dictionary from being cluttered 1505 # with members and reduces the chance of member collisions.
1506 - def __init__(self):
1507 self.data = None
1508
1509 - def round(self, line):
1510 # Make sure nobody's trying to pass me PSI-BLAST data! 1511 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1512
1513 - def start_header(self):
1514 self.data = Record.Blast() 1515 _HeaderConsumer.start_header(self)
1516
1517 - def end_header(self):
1518 _HeaderConsumer.end_header(self) 1519 self.data.__dict__.update(self._header.__dict__)
1520
1521 - def end_descriptions(self):
1522 self.data.descriptions = self._descriptions
1523
1524 - def end_alignment(self):
1525 _AlignmentConsumer.end_alignment(self) 1526 if self._alignment.hsps: 1527 self.data.alignments.append(self._alignment) 1528 if self._multiple_alignment.alignment: 1529 self.data.multiple_alignment = self._multiple_alignment
1530
1531 - def end_hsp(self):
1532 _HSPConsumer.end_hsp(self) 1533 try: 1534 self._alignment.hsps.append(self._hsp) 1535 except AttributeError: 1536 raise ValueError("Found an HSP before an alignment")
1537
1538 - def end_database_report(self):
1539 _DatabaseReportConsumer.end_database_report(self) 1540 self.data.__dict__.update(self._dr.__dict__)
1541
1542 - def end_parameters(self):
1543 _ParametersConsumer.end_parameters(self) 1544 self.data.__dict__.update(self._params.__dict__)
1545 1546
1547 -class _PSIBlastConsumer(AbstractConsumer, 1548 _HeaderConsumer, 1549 _DescriptionConsumer, 1550 _AlignmentConsumer, 1551 _HSPConsumer, 1552 _DatabaseReportConsumer, 1553 _ParametersConsumer 1554 ):
1555 - def __init__(self):
1556 self.data = None
1557
1558 - def start_header(self):
1559 self.data = Record.PSIBlast() 1560 _HeaderConsumer.start_header(self)
1561
1562 - def end_header(self):
1563 _HeaderConsumer.end_header(self) 1564 self.data.__dict__.update(self._header.__dict__)
1565
1566 - def start_descriptions(self):
1567 self._round = Record.Round() 1568 self.data.rounds.append(self._round) 1569 _DescriptionConsumer.start_descriptions(self)
1570
1571 - def end_descriptions(self):
1572 _DescriptionConsumer.end_descriptions(self) 1573 self._round.number = self._roundnum 1574 if self._descriptions: 1575 self._round.new_seqs.extend(self._descriptions) 1576 self._round.reused_seqs.extend(self._model_sequences) 1577 self._round.new_seqs.extend(self._nonmodel_sequences) 1578 if self._converged: 1579 self.data.converged = 1
1580
1581 - def end_alignment(self):
1582 _AlignmentConsumer.end_alignment(self) 1583 if self._alignment.hsps: 1584 self._round.alignments.append(self._alignment) 1585 if self._multiple_alignment: 1586 self._round.multiple_alignment = self._multiple_alignment
1587
1588 - def end_hsp(self):
1589 _HSPConsumer.end_hsp(self) 1590 try: 1591 self._alignment.hsps.append(self._hsp) 1592 except AttributeError: 1593 raise ValueError("Found an HSP before an alignment")
1594
1595 - def end_database_report(self):
1596 _DatabaseReportConsumer.end_database_report(self) 1597 self.data.__dict__.update(self._dr.__dict__)
1598
1599 - def end_parameters(self):
1600 _ParametersConsumer.end_parameters(self) 1601 self.data.__dict__.update(self._params.__dict__)
1602 1603
1604 -class Iterator(object):
1605 """Iterates over a file of multiple BLAST results. 1606 1607 Methods: 1608 next Return the next record from the stream, or None. 1609 1610 """
1611 - def __init__(self, handle, parser=None):
1612 """__init__(self, handle, parser=None) 1613 1614 Create a new iterator. handle is a file-like object. parser 1615 is an optional Parser object to change the results into another form. 1616 If set to None, then the raw contents of the file will be returned. 1617 1618 """ 1619 try: 1620 handle.readline 1621 except AttributeError: 1622 raise ValueError( 1623 "I expected a file handle or file-like object, got %s" 1624 % type(handle)) 1625 self._uhandle = File.UndoHandle(handle) 1626 self._parser = parser 1627 self._header = []
1628
1629 - def __next__(self):
1630 """next(self) -> object 1631 1632 Return the next Blast record from the file. If no more records, 1633 return None. 1634 1635 """ 1636 lines = [] 1637 query = False 1638 while True: 1639 line = self._uhandle.readline() 1640 if not line: 1641 break 1642 # If I've reached the next one, then put the line back and stop. 1643 if lines and (line.startswith('BLAST') or 1644 line.startswith('BLAST', 1) or 1645 line.startswith('<?xml ')): 1646 self._uhandle.saveline(line) 1647 break 1648 # New style files omit the BLAST line to mark a new query: 1649 if line.startswith("Query="): 1650 if not query: 1651 if not self._header: 1652 self._header = lines[:] 1653 query = True 1654 else: 1655 # Start of another record 1656 self._uhandle.saveline(line) 1657 break 1658 lines.append(line) 1659 1660 if query and "BLAST" not in lines[0]: 1661 # Cheat and re-insert the header 1662 # print "-"*50 1663 # print "".join(self._header) 1664 # print "-"*50 1665 # print "".join(lines) 1666 # print "-"*50 1667 lines = self._header + lines 1668 1669 if not lines: 1670 return None 1671 1672 data = ''.join(lines) 1673 if self._parser is not None: 1674 return self._parser.parse(StringIO(data)) 1675 return data
1676 1677 if sys.version_info[0] < 3:
1678 - def next(self):
1679 """Python 2 style alias for Python 3 style __next__ method.""" 1680 return self.__next__()
1681
1682 - def __iter__(self):
1683 return iter(self.__next__, None)
1684 1685
1686 -def _re_search(regex, line, error_msg):
1687 m = re.search(regex, line) 1688 if not m: 1689 raise ValueError(error_msg) 1690 return m.groups()
1691 1692
1693 -def _get_cols(line, cols_to_get, ncols=None, expected=None):
1694 if expected is None: 1695 expected = {} 1696 cols = line.split() 1697 1698 # Check to make sure number of columns is correct 1699 if ncols is not None and len(cols) != ncols: 1700 raise ValueError("I expected %d columns (got %d) in line\n%s" 1701 % (ncols, len(cols), line)) 1702 1703 # Check to make sure columns contain the correct data 1704 for k in expected: 1705 if cols[k] != expected[k]: 1706 raise ValueError("I expected '%s' in column %d in line\n%s" 1707 % (expected[k], k, line)) 1708 1709 # Construct the answer tuple 1710 results = [] 1711 for c in cols_to_get: 1712 results.append(cols[c]) 1713 return tuple(results)
1714 1715
1716 -def _safe_int(str):
1717 try: 1718 return int(str) 1719 except ValueError: 1720 # Something went wrong. Try to clean up the string. 1721 # Remove all commas from the string 1722 str = str.replace(',', '') 1723 # try again after removing commas. 1724 # Note int() will return a long rather than overflow 1725 try: 1726 return int(str) 1727 except ValueError: 1728 pass 1729 # Call float to handle things like "54.3", note could lose precision, e.g. 1730 # >>> int("5399354557888517312") 1731 # 5399354557888517312 1732 # >>> int(float("5399354557888517312")) 1733 # 5399354557888517120 1734 return int(float(str))
1735 1736
1737 -def _safe_float(str):
1738 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1739 # float('e-172') does not produce an error on his platform. Thus, 1740 # we need to check the string for this condition. 1741 1742 # Sometimes BLAST leaves of the '1' in front of an exponent. 1743 if str and str[0] in ['E', 'e']: 1744 str = '1' + str 1745 try: 1746 return float(str) 1747 except ValueError: 1748 # Remove all commas from the string 1749 str = str.replace(',', '') 1750 # try again. 1751 return float(str)
1752 1753
1754 -class _BlastErrorConsumer(_BlastConsumer):
1755 - def __init__(self):
1757
1758 - def noevent(self, line):
1759 if 'Query must be at least wordsize' in line: 1760 raise ShortQueryBlastError("Query must be at least wordsize") 1761 # Now pass the line back up to the superclass. 1762 method = getattr(_BlastConsumer, 'noevent', 1763 _BlastConsumer.__getattr__(self, 'noevent')) 1764 method(line)
1765 1766
1767 -class BlastErrorParser(AbstractParser):
1768 """Attempt to catch and diagnose BLAST errors while parsing. 1769 1770 This utilizes the BlastParser module but adds an additional layer 1771 of complexity on top of it by attempting to diagnose ValueErrors 1772 that may actually indicate problems during BLAST parsing. 1773 1774 Current BLAST problems this detects are: 1775 o LowQualityBlastError - When BLASTing really low quality sequences 1776 (ie. some GenBank entries which are just short stretches of a single 1777 nucleotide), BLAST will report an error with the sequence and be 1778 unable to search with this. This will lead to a badly formatted 1779 BLAST report that the parsers choke on. The parser will convert the 1780 ValueError to a LowQualityBlastError and attempt to provide useful 1781 information. 1782 """
1783 - def __init__(self, bad_report_handle=None):
1784 """Initialize a parser that tries to catch BlastErrors. 1785 1786 Arguments: 1787 o bad_report_handle - An optional argument specifying a handle 1788 where bad reports should be sent. This would allow you to save 1789 all of the bad reports to a file, for instance. If no handle 1790 is specified, the bad reports will not be saved. 1791 """ 1792 self._bad_report_handle = bad_report_handle 1793 1794 # self._b_parser = BlastParser() 1795 self._scanner = _Scanner() 1796 self._consumer = _BlastErrorConsumer()
1797
1798 - def parse(self, handle):
1799 """Parse a handle, attempting to diagnose errors. 1800 """ 1801 results = handle.read() 1802 1803 try: 1804 self._scanner.feed(StringIO(