Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23  """ 
  24   
  25  from __future__ import print_function 
  26   
  27  import sys 
  28  import re 
  29   
  30  from Bio._py3k import StringIO 
  31  from Bio.ParserSupport import AbstractParser, AbstractConsumer 
  32  from Bio.ParserSupport import read_and_call, read_and_call_until 
  33  from Bio.ParserSupport import read_and_call_while, attempt_read_and_call 
  34  from Bio.ParserSupport import is_blank_line, safe_peekline, safe_readline 
  35  from Bio import File 
  36  from Bio.Blast import Record 
  37   
  38  from Bio import BiopythonDeprecationWarning 
  39  import warnings 
  40  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  41                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  42   
  43   
  44  _score_e_re = re.compile(r'Score +E') 
  45   
  46   
47 -class LowQualityBlastError(Exception):
48 """Error caused by running a low quality sequence through BLAST. 49 50 When low quality sequences (like GenBank entries containing only 51 stretches of a single nucleotide) are BLASTed, they will result in 52 BLAST generating an error and not being able to perform the BLAST. 53 search. This error should be raised for the BLAST reports produced 54 in this case. 55 """ 56 pass
57 58
59 -class ShortQueryBlastError(Exception):
60 """Error caused by running a short query sequence through BLAST. 61 62 If the query sequence is too short, BLAST outputs warnings and errors:: 63 64 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 65 [blastall] ERROR: [000.000] AT1G08320: Blast: 66 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 67 done 68 69 This exception is raised when that condition is detected. 70 """ 71 pass
72 73
74 -class _Scanner(object):
75 """Scan BLAST output from blastall or blastpgp. 76 77 Tested with blastall and blastpgp v2.0.10, v2.0.11 78 79 Methods: 80 - feed Feed data into the scanner. 81 """
82 - def feed(self, handle, consumer):
83 """S.feed(handle, consumer) 84 85 Feed in a BLAST report for scanning. handle is a file-like 86 object that contains the BLAST report. consumer is a Consumer 87 object that will receive events as the report is scanned. 88 """ 89 if isinstance(handle, File.UndoHandle): 90 uhandle = handle 91 else: 92 uhandle = File.UndoHandle(handle) 93 94 # Try to fast-forward to the beginning of the blast report. 95 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 96 # Now scan the BLAST report. 97 self._scan_header(uhandle, consumer) 98 self._scan_rounds(uhandle, consumer) 99 self._scan_database_report(uhandle, consumer) 100 self._scan_parameters(uhandle, consumer)
101
102 - def _scan_header(self, uhandle, consumer):
103 # BLASTP 2.0.10 [Aug-26-1999] 104 # 105 # 106 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 107 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 108 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 109 # programs", Nucleic Acids Res. 25:3389-3402. 110 # 111 # Query= test 112 # (140 letters) 113 # 114 # Database: sdqib40-1.35.seg.fa 115 # 1323 sequences; 223,339 total letters 116 # 117 # ======================================================== 118 # This next example is from the online version of Blast, 119 # note there are TWO references, an RID line, and also 120 # the database is BEFORE the query line. 121 # Note there possibleuse of non-ASCII in the author names. 122 # ======================================================== 123 # 124 # BLASTP 2.2.15 [Oct-15-2006] 125 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 126 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 127 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 128 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 129 # 130 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 131 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 132 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 133 # protein database searches with composition-based statistics 134 # and other refinements", Nucleic Acids Res. 29:2994-3005. 135 # 136 # RID: 1166022616-19998-65316425856.BLASTQ1 137 # 138 # 139 # Database: All non-redundant GenBank CDS 140 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 141 # 4,254,166 sequences; 1,462,033,012 total letters 142 # Query= gi:16127998 143 # Length=428 144 # 145 146 consumer.start_header() 147 148 read_and_call(uhandle, consumer.version, contains='BLAST') 149 read_and_call_while(uhandle, consumer.noevent, blank=1) 150 151 # There might be a <pre> line, for qblast output. 152 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 153 154 # Read the reference(s) 155 while attempt_read_and_call(uhandle, 156 consumer.reference, start='Reference'): 157 # References are normally multiline terminated by a blank line 158 # (or, based on the old code, the RID line) 159 while True: 160 line = uhandle.readline() 161 if is_blank_line(line): 162 consumer.noevent(line) 163 break 164 elif line.startswith("RID"): 165 break 166 else: 167 # More of the reference 168 consumer.reference(line) 169 170 # Deal with the optional RID: ... 171 read_and_call_while(uhandle, consumer.noevent, blank=1) 172 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 173 read_and_call_while(uhandle, consumer.noevent, blank=1) 174 175 # blastpgp may have a reference for compositional score matrix 176 # adjustment (see Bug 2502): 177 if attempt_read_and_call( 178 uhandle, consumer.reference, start="Reference"): 179 read_and_call_until(uhandle, consumer.reference, blank=1) 180 read_and_call_while(uhandle, consumer.noevent, blank=1) 181 182 # blastpgp has a Reference for composition-based statistics. 183 if attempt_read_and_call( 184 uhandle, consumer.reference, start="Reference"): 185 read_and_call_until(uhandle, consumer.reference, blank=1) 186 read_and_call_while(uhandle, consumer.noevent, blank=1) 187 188 line = uhandle.peekline() 189 assert line.strip() != "" 190 assert not line.startswith("RID:") 191 if line.startswith("Query="): 192 # This is an old style query then database... 193 194 # Read the Query lines and the following blank line. 195 read_and_call(uhandle, consumer.query_info, start='Query=') 196 read_and_call_until(uhandle, consumer.query_info, blank=1) 197 read_and_call_while(uhandle, consumer.noevent, blank=1) 198 199 # Read the database lines and the following blank line. 200 read_and_call_until(uhandle, consumer.database_info, end='total letters') 201 read_and_call(uhandle, consumer.database_info, contains='sequences') 202 read_and_call_while(uhandle, consumer.noevent, blank=1) 203 elif line.startswith("Database:"): 204 # This is a new style database then query... 205 read_and_call_until(uhandle, consumer.database_info, end='total letters') 206 read_and_call(uhandle, consumer.database_info, contains='sequences') 207 read_and_call_while(uhandle, consumer.noevent, blank=1) 208 209 # Read the Query lines and the following blank line. 210 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 211 # the "... Score E" line instead. 212 read_and_call(uhandle, consumer.query_info, start='Query=') 213 # BLAST 2.2.25+ has a blank line before Length= 214 read_and_call_until(uhandle, consumer.query_info, start='Length=') 215 while True: 216 line = uhandle.peekline() 217 if not line.strip() or _score_e_re.search(line) is not None: 218 break 219 # It is more of the query (and its length) 220 read_and_call(uhandle, consumer.query_info) 221 read_and_call_while(uhandle, consumer.noevent, blank=1) 222 else: 223 raise ValueError("Invalid header?") 224 225 consumer.end_header()
226
227 - def _scan_rounds(self, uhandle, consumer):
228 # Scan a bunch of rounds. 229 # Each round begins with either a "Searching......" line 230 # or a 'Score E' line followed by descriptions and alignments. 231 # The email server doesn't give the "Searching....." line. 232 # If there is no 'Searching.....' line then you'll first see a 233 # 'Results from round' line 234 235 while not self._eof(uhandle): 236 line = safe_peekline(uhandle) 237 if not line.startswith('Searching') and \ 238 not line.startswith('Results from round') and \ 239 _score_e_re.search(line) is None and \ 240 'No hits found' not in line: 241 break 242 self._scan_descriptions(uhandle, consumer) 243 self._scan_alignments(uhandle, consumer)
244
245 - def _scan_descriptions(self, uhandle, consumer):
246 # Searching..................................................done 247 # Results from round 2 248 # 249 # 250 # Sc 251 # Sequences producing significant alignments: (b 252 # Sequences used in model and found again: 253 # 254 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 255 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 256 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 257 # 258 # Sequences not found previously or not previously below threshold: 259 # 260 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 261 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 262 # 263 264 # If PSI-BLAST, may also have: 265 # 266 # CONVERGED! 267 268 consumer.start_descriptions() 269 270 # Read 'Searching' 271 # This line seems to be missing in BLASTN 2.1.2 (others?) 272 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 273 274 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 275 # If this happens, the handle will yield no more information. 276 if not uhandle.peekline(): 277 raise ValueError("Unexpected end of blast report. " + 278 "Looks suspiciously like a PSI-BLAST crash.") 279 280 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 281 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 282 # [blastall] ERROR: [000.000] AT1G08320: Blast: 283 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 284 # done 285 # Reported by David Weisman. 286 # Check for these error lines and ignore them for now. Let 287 # the BlastErrorParser deal with them. 288 line = uhandle.peekline() 289 if "ERROR:" in line or line.startswith("done"): 290 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 291 read_and_call(uhandle, consumer.noevent, start="done") 292 293 # Check to see if this is PSI-BLAST. 294 # If it is, the 'Searching' line will be followed by: 295 # (version 2.0.10) 296 # Searching............................. 297 # Results from round 2 298 # or (version 2.0.11) 299 # Searching............................. 300 # 301 # 302 # Results from round 2 303 304 # Skip a bunch of blank lines. 305 read_and_call_while(uhandle, consumer.noevent, blank=1) 306 # Check for the results line if it's there. 307 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 308 read_and_call_while(uhandle, consumer.noevent, blank=1) 309 310 # Three things can happen here: 311 # 1. line contains 'Score E' 312 # 2. line contains "No hits found" 313 # 3. no descriptions 314 # The first one begins a bunch of descriptions. The last two 315 # indicates that no descriptions follow, and we should go straight 316 # to the alignments. 317 if not attempt_read_and_call( 318 uhandle, consumer.description_header, 319 has_re=_score_e_re): 320 # Either case 2 or 3. Look for "No hits found". 321 attempt_read_and_call(uhandle, consumer.no_hits, 322 contains='No hits found') 323 try: 324 read_and_call_while(uhandle, consumer.noevent, blank=1) 325 except ValueError as err: 326 if str(err) != "Unexpected end of stream.": 327 raise err 328 329 consumer.end_descriptions() 330 # Stop processing. 331 return 332 333 # Read the score header lines 334 read_and_call(uhandle, consumer.description_header, 335 start='Sequences producing') 336 337 # If PSI-BLAST, read the 'Sequences used in model' line. 338 attempt_read_and_call(uhandle, consumer.model_sequences, 339 start='Sequences used in model') 340 read_and_call_while(uhandle, consumer.noevent, blank=1) 341 342 # In BLAT, rather than a "No hits found" line, we just 343 # get no descriptions (and no alignments). This can be 344 # spotted because the next line is the database block: 345 if safe_peekline(uhandle).startswith(" Database:"): 346 consumer.end_descriptions() 347 # Stop processing. 348 return 349 350 # Read the descriptions and the following blank lines, making 351 # sure that there are descriptions. 352 if not uhandle.peekline().startswith('Sequences not found'): 353 read_and_call_until(uhandle, consumer.description, blank=1) 354 read_and_call_while(uhandle, consumer.noevent, blank=1) 355 356 # If PSI-BLAST, read the 'Sequences not found' line followed 357 # by more descriptions. However, I need to watch out for the 358 # case where there were no sequences not found previously, in 359 # which case there will be no more descriptions. 360 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 361 start='Sequences not found'): 362 # Read the descriptions and the following blank lines. 363 read_and_call_while(uhandle, consumer.noevent, blank=1) 364 l = safe_peekline(uhandle) 365 # Brad -- added check for QUERY. On some PSI-BLAST outputs 366 # there will be a 'Sequences not found' line followed by no 367 # descriptions. Check for this case since the first thing you'll 368 # get is a blank line and then 'QUERY' 369 if not l.startswith('CONVERGED') and l[0] != '>' \ 370 and not l.startswith('QUERY'): 371 read_and_call_until(uhandle, consumer.description, blank=1) 372 read_and_call_while(uhandle, consumer.noevent, blank=1) 373 374 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 375 read_and_call_while(uhandle, consumer.noevent, blank=1) 376 377 consumer.end_descriptions()
378
379 - def _scan_alignments(self, uhandle, consumer):
380 if self._eof(uhandle): 381 return 382 383 # qblast inserts a helpful line here. 384 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 385 386 # First, check to see if I'm at the database report. 387 line = safe_peekline(uhandle) 388 if not line: 389 # EOF 390 return 391 elif line.startswith(' Database') or line.startswith("Lambda"): 392 return 393 elif line[0] == '>': 394 # XXX make a better check here between pairwise and masterslave 395 self._scan_pairwise_alignments(uhandle, consumer) 396 elif line.startswith('Effective'): 397 return 398 else: 399 # XXX put in a check to make sure I'm in a masterslave alignment 400 self._scan_masterslave_alignment(uhandle, consumer)
401
402 - def _scan_pairwise_alignments(self, uhandle, consumer):
403 while not self._eof(uhandle): 404 line = safe_peekline(uhandle) 405 if line[0] != '>': 406 break 407 self._scan_one_pairwise_alignment(uhandle, consumer)
408
409 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
410 if self._eof(uhandle): 411 return 412 consumer.start_alignment() 413 414 self._scan_alignment_header(uhandle, consumer) 415 416 # Scan a bunch of score/alignment pairs. 417 while True: 418 if self._eof(uhandle): 419 # Shouldn't have issued that _scan_alignment_header event... 420 break 421 line = safe_peekline(uhandle) 422 if not line.startswith(' Score'): 423 break 424 self._scan_hsp(uhandle, consumer) 425 consumer.end_alignment()
426
427 - def _scan_alignment_header(self, uhandle, consumer):
428 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 429 # stearothermophilus] 430 # Length = 81 431 # 432 # Or, more recently with different white space: 433 # 434 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 435 # gi|15829258|ref|NP_308031.1| threonine synthase 436 # ... 437 # Length=428 438 read_and_call(uhandle, consumer.title, start='>') 439 while True: 440 line = safe_readline(uhandle) 441 if line.lstrip().startswith('Length =') \ 442 or line.lstrip().startswith('Length='): 443 consumer.length(line) 444 break 445 elif is_blank_line(line): 446 # Check to make sure I haven't missed the Length line 447 raise ValueError("I missed the Length in an alignment header") 448 consumer.title(line) 449 450 # Older versions of BLAST will have a line with some spaces. 451 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 452 if not attempt_read_and_call(uhandle, consumer.noevent, 453 start=' '): 454 read_and_call(uhandle, consumer.noevent, blank=1)
455
456 - def _scan_hsp(self, uhandle, consumer):
457 consumer.start_hsp() 458 self._scan_hsp_header(uhandle, consumer) 459 self._scan_hsp_alignment(uhandle, consumer) 460 consumer.end_hsp()
461
462 - def _scan_hsp_header(self, uhandle, consumer):
463 # Score = 22.7 bits (47), Expect = 2.5 464 # Identities = 10/36 (27%), Positives = 18/36 (49%) 465 # Strand = Plus / Plus 466 # Frame = +3 467 # 468 469 read_and_call(uhandle, consumer.score, start=' Score') 470 read_and_call(uhandle, consumer.identities, start=' Identities') 471 # BLASTN 472 attempt_read_and_call(uhandle, consumer.strand, start=' Strand') 473 # BLASTX, TBLASTN, TBLASTX 474 attempt_read_and_call(uhandle, consumer.frame, start=' Frame') 475 read_and_call(uhandle, consumer.noevent, blank=1)
476
477 - def _scan_hsp_alignment(self, uhandle, consumer):
478 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 479 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 480 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 481 # 482 # Query: 64 AEKILIKR 71 483 # I +K 484 # Sbjct: 70 PNIIQLKD 77 485 # 486 487 while True: 488 # Blastn adds an extra line filled with spaces before Query 489 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 490 read_and_call(uhandle, consumer.query, start='Query') 491 read_and_call(uhandle, consumer.align, start=' ') 492 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 493 try: 494 read_and_call_while(uhandle, consumer.noevent, blank=1) 495 except ValueError as err: 496 if str(err) != "Unexpected end of stream.": 497 raise err 498 # End of File (well, it looks like it with recent versions 499 # of BLAST for multiple queries after the Iterator class 500 # has broken up the whole file into chunks). 501 break 502 line = safe_peekline(uhandle) 503 # Alignment continues if I see a 'Query' or the spaces for Blastn. 504 if not (line.startswith('Query') or line.startswith(' ')): 505 break
506
507 - def _scan_masterslave_alignment(self, uhandle, consumer):
508 consumer.start_alignment() 509 while True: 510 line = safe_readline(uhandle) 511 # Check to see whether I'm finished reading the alignment. 512 # This is indicated by 1) database section, 2) next psi-blast 513 # round, which can also be a 'Results from round' if no 514 # searching line is present 515 # patch by chapmanb 516 if line.startswith('Searching') or \ 517 line.startswith('Results from round'): 518 uhandle.saveline(line) 519 break 520 elif line.startswith(' Database'): 521 uhandle.saveline(line) 522 break 523 elif is_blank_line(line): 524 consumer.noevent(line) 525 else: 526 consumer.multalign(line) 527 read_and_call_while(uhandle, consumer.noevent, blank=1) 528 consumer.end_alignment()
529
530 - def _eof(self, uhandle):
531 try: 532 line = safe_peekline(uhandle) 533 except ValueError as err: 534 if str(err) != "Unexpected end of stream.": 535 raise err 536 line = "" 537 return not line
538
539 - def _scan_database_report(self, uhandle, consumer):
540 # Database: sdqib40-1.35.seg.fa 541 # Posted date: Nov 1, 1999 4:25 PM 542 # Number of letters in database: 223,339 543 # Number of sequences in database: 1323 544 # 545 # Lambda K H 546 # 0.322 0.133 0.369 547 # 548 # Gapped 549 # Lambda K H 550 # 0.270 0.0470 0.230 551 # 552 ########################################## 553 # Or, more recently Blast 2.2.15 gives less blank lines 554 ########################################## 555 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 556 # environmental samples 557 # Posted date: Dec 12, 2006 5:51 PM 558 # Number of letters in database: 667,088,753 559 # Number of sequences in database: 2,094,974 560 # Lambda K H 561 # 0.319 0.136 0.395 562 # Gapped 563 # Lambda K H 564 # 0.267 0.0410 0.140 565 566 if self._eof(uhandle): 567 return 568 569 consumer.start_database_report() 570 571 # Subset of the database(s) listed below 572 # Number of letters searched: 562,618,960 573 # Number of sequences searched: 228,924 574 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 575 read_and_call(uhandle, consumer.noevent, contains="letters") 576 read_and_call(uhandle, consumer.noevent, contains="sequences") 577 read_and_call(uhandle, consumer.noevent, start=" ") 578 579 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 580 # was missing the "Database" stanza completely. 581 while attempt_read_and_call(uhandle, consumer.database, 582 start=' Database'): 583 # BLAT output ends abruptly here, without any of the other 584 # information. Check to see if this is the case. If so, 585 # then end the database report here gracefully. 586 if not uhandle.peekline().strip() \ 587 or uhandle.peekline().startswith("BLAST"): 588 consumer.end_database_report() 589 return 590 591 # Database can span multiple lines. 592 read_and_call_until(uhandle, consumer.database, start=' Posted') 593 read_and_call(uhandle, consumer.posted_date, start=' Posted') 594 read_and_call(uhandle, consumer.num_letters_in_database, 595 start=' Number of letters') 596 read_and_call(uhandle, consumer.num_sequences_in_database, 597 start=' Number of sequences') 598 # There may not be a line starting with spaces... 599 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 600 601 line = safe_readline(uhandle) 602 uhandle.saveline(line) 603 if 'Lambda' in line: 604 break 605 606 try: 607 read_and_call(uhandle, consumer.noevent, start='Lambda') 608 read_and_call(uhandle, consumer.ka_params) 609 except Exception: # TODO: ValueError, AttributeError? 610 pass 611 612 # This blank line is optional: 613 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 614 615 # not BLASTP 616 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 617 # not TBLASTX 618 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 619 read_and_call(uhandle, consumer.ka_params_gap) 620 621 # Blast 2.2.4 can sometimes skip the whole parameter section. 622 # Thus, I need to be careful not to read past the end of the 623 # file. 624 try: 625 read_and_call_while(uhandle, consumer.noevent, blank=1) 626 except ValueError as x: 627 if str(x) != "Unexpected end of stream.": 628 raise 629 consumer.end_database_report()
630
631 - def _scan_parameters(self, uhandle, consumer):
632 # Matrix: BLOSUM62 633 # Gap Penalties: Existence: 11, Extension: 1 634 # Number of Hits to DB: 50604 635 # Number of Sequences: 1323 636 # Number of extensions: 1526 637 # Number of successful extensions: 6 638 # Number of sequences better than 10.0: 5 639 # Number of HSP's better than 10.0 without gapping: 5 640 # Number of HSP's successfully gapped in prelim test: 0 641 # Number of HSP's that attempted gapping in prelim test: 1 642 # Number of HSP's gapped (non-prelim): 5 643 # length of query: 140 644 # length of database: 223,339 645 # effective HSP length: 39 646 # effective length of query: 101 647 # effective length of database: 171,742 648 # effective search space: 17345942 649 # effective search space used: 17345942 650 # T: 11 651 # A: 40 652 # X1: 16 ( 7.4 bits) 653 # X2: 38 (14.8 bits) 654 # X3: 64 (24.9 bits) 655 # S1: 41 (21.9 bits) 656 # S2: 42 (20.8 bits) 657 ########################################## 658 # Or, more recently Blast(x) 2.2.15 gives 659 ########################################## 660 # Matrix: BLOSUM62 661 # Gap Penalties: Existence: 11, Extension: 1 662 # Number of Sequences: 4535438 663 # Number of Hits to DB: 2,588,844,100 664 # Number of extensions: 60427286 665 # Number of successful extensions: 126433 666 # Number of sequences better than 2.0: 30 667 # Number of HSP's gapped: 126387 668 # Number of HSP's successfully gapped: 35 669 # Length of query: 291 670 # Length of database: 1,573,298,872 671 # Length adjustment: 130 672 # Effective length of query: 161 673 # Effective length of database: 983,691,932 674 # Effective search space: 158374401052 675 # Effective search space used: 158374401052 676 # Neighboring words threshold: 12 677 # Window for multiple hits: 40 678 # X1: 16 ( 7.3 bits) 679 # X2: 38 (14.6 bits) 680 # X3: 64 (24.7 bits) 681 # S1: 41 (21.7 bits) 682 # S2: 32 (16.9 bits) 683 684 # Blast 2.2.4 can sometimes skip the whole parameter section. 685 # BLAT also skips the whole parameter section. 686 # Thus, check to make sure that the parameter section really 687 # exists. 688 if not uhandle.peekline().strip(): 689 return 690 691 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 692 # "Number of Sequences" lines. 693 consumer.start_parameters() 694 695 # Matrix line may be missing in BLASTN 2.2.9 696 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 697 # not TBLASTX 698 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 699 700 attempt_read_and_call(uhandle, consumer.num_sequences, 701 start='Number of Sequences') 702 attempt_read_and_call(uhandle, consumer.num_hits, 703 start='Number of Hits') 704 attempt_read_and_call(uhandle, consumer.num_sequences, 705 start='Number of Sequences') 706 attempt_read_and_call(uhandle, consumer.num_extends, 707 start='Number of extensions') 708 attempt_read_and_call(uhandle, consumer.num_good_extends, 709 start='Number of successful') 710 711 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 712 start='Number of sequences') 713 714 # not BLASTN, TBLASTX 715 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 716 start="Number of HSP's better"): 717 # BLASTN 2.2.9 718 if attempt_read_and_call(uhandle, consumer.noevent, 719 start="Number of HSP's gapped:"): 720 read_and_call(uhandle, consumer.noevent, 721 start="Number of HSP's successfully") 722 # This is omitted in 2.2.15 723 attempt_read_and_call(uhandle, consumer.noevent, 724 start="Number of extra gapped extensions") 725 else: 726 read_and_call(uhandle, consumer.hsps_prelim_gapped, 727 start="Number of HSP's successfully") 728 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 729 start="Number of HSP's that") 730 read_and_call(uhandle, consumer.hsps_gapped, 731 start="Number of HSP's gapped") 732 # e.g. BLASTX 2.2.15 where the "better" line is missing 733 elif attempt_read_and_call(uhandle, consumer.noevent, 734 start="Number of HSP's gapped"): 735 read_and_call(uhandle, consumer.noevent, 736 start="Number of HSP's successfully") 737 738 # not in blastx 2.2.1 739 attempt_read_and_call(uhandle, consumer.query_length, 740 has_re=re.compile(r"[Ll]ength of query")) 741 # Not in BLASTX 2.2.22+ 742 attempt_read_and_call(uhandle, consumer.database_length, 743 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 744 745 # BLASTN 2.2.9 746 attempt_read_and_call(uhandle, consumer.noevent, 747 start="Length adjustment") 748 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 749 start='effective HSP') 750 # Not in blastx 2.2.1 751 attempt_read_and_call( 752 uhandle, consumer.effective_query_length, 753 has_re=re.compile(r'[Ee]ffective length of query')) 754 755 # This is not in BLASTP 2.2.15 756 attempt_read_and_call( 757 uhandle, consumer.effective_database_length, 758 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 759 # Not in blastx 2.2.1, added a ':' to distinguish between 760 # this and the 'effective search space used' line 761 attempt_read_and_call( 762 uhandle, consumer.effective_search_space, 763 has_re=re.compile(r'[Ee]ffective search space:')) 764 # Does not appear in BLASTP 2.0.5 765 attempt_read_and_call( 766 uhandle, consumer.effective_search_space_used, 767 has_re=re.compile(r'[Ee]ffective search space used')) 768 769 # BLASTX, TBLASTN, TBLASTX 770 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 771 772 # not in BLASTN 2.2.9 773 attempt_read_and_call(uhandle, consumer.threshold, start='T') 774 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 775 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 776 777 # not in BLASTX 2.2.15 778 attempt_read_and_call(uhandle, consumer.window_size, start='A') 779 # get this instead: "Window for multiple hits: 40" 780 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 781 782 # not in BLASTX 2.2.22+ 783 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 784 # not TBLASTN 785 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 786 787 # not BLASTN, TBLASTX 788 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 789 start='X3') 790 791 # not TBLASTN 792 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 793 # not in blastx 2.2.1 794 # first we make sure we have additional lines to work with, if 795 # not then the file is done and we don't have a final S2 796 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 797 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 798 799 consumer.end_parameters()
800 801
802 -class BlastParser(AbstractParser):
803 """Parses BLAST data into a Record.Blast object. 804 805 """
806 - def __init__(self):
807 """__init__(self)""" 808 self._scanner = _Scanner() 809 self._consumer = _BlastConsumer()
810
811 - def parse(self, handle):
812 """parse(self, handle)""" 813 self._scanner.feed(handle, self._consumer) 814 return self._consumer.data
815 816
817 -class PSIBlastParser(AbstractParser):
818 """Parses BLAST data into a Record.PSIBlast object. 819 820 """
821 - def __init__(self):
822 """__init__(self)""" 823 self._scanner = _Scanner() 824 self._consumer = _PSIBlastConsumer()
825
826 - def parse(self, handle):
827 """parse(self, handle)""" 828 self._scanner.feed(handle, self._consumer) 829 return self._consumer.data
830 831
832 -class _HeaderConsumer(object):
833 - def start_header(self):
834 self._header = Record.Header()
835
836 - def version(self, line):
837 c = line.split() 838 self._header.application = c[0] 839 self._header.version = c[1] 840 if len(c) > 2: 841 # The date is missing in the new C++ output from blastx 2.2.22+ 842 # Just get "BLASTX 2.2.22+\n" and that's all. 843 self._header.date = c[2][1:-1]
844
845 - def reference(self, line):
846 if line.startswith('Reference: '): 847 self._header.reference = line[11:] 848 else: 849 self._header.reference = self._header.reference + line
850
851 - def query_info(self, line):
852 if line.startswith('Query= '): 853 self._header.query = line[7:].lstrip() 854 elif line.startswith('Length='): 855 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 856 self._header.query_letters = _safe_int(line[7:].strip()) 857 elif not line.startswith(' '): # continuation of query_info 858 self._header.query = "%s%s" % (self._header.query, line) 859 else: 860 # Hope it is the old style way to give the query length: 861 letters, = _re_search( 862 r"([0-9,]+) letters", line, 863 "I could not find the number of letters in line\n%s" % line) 864 self._header.query_letters = _safe_int(letters)
865
866 - def database_info(self, line):
867 line = line.rstrip() 868 if line.startswith('Database: '): 869 self._header.database = line[10:] 870 elif not line.endswith('total letters'): 871 if self._header.database: 872 # Need to include a space when merging multi line datase descr 873 self._header.database = self._header.database + " " + line.strip() 874 else: 875 self._header.database = line.strip() 876 else: 877 sequences, letters = _re_search( 878 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 879 "I could not find the sequences and letters in line\n%s" % line) 880 self._header.database_sequences = _safe_int(sequences) 881 self._header.database_letters = _safe_int(letters)
882
883 - def end_header(self):
884 # Get rid of the trailing newlines 885 self._header.reference = self._header.reference.rstrip() 886 self._header.query = self._header.query.rstrip()
887 888
889 -class _DescriptionConsumer(object):
890 - def start_descriptions(self):
891 self._descriptions = [] 892 self._model_sequences = [] 893 self._nonmodel_sequences = [] 894 self._converged = 0 895 self._type = None 896 self._roundnum = None 897 898 self.__has_n = 0 # Does the description line contain an N value?
899
900 - def description_header(self, line):
901 if line.startswith('Sequences producing'): 902 cols = line.split() 903 if cols[-1] == 'N': 904 self.__has_n = 1
905
906 - def description(self, line):
907 dh = self._parse(line) 908 if self._type == 'model': 909 self._model_sequences.append(dh) 910 elif self._type == 'nonmodel': 911 self._nonmodel_sequences.append(dh) 912 else: 913 self._descriptions.append(dh)
914
915 - def model_sequences(self, line):
916 self._type = 'model'
917
918 - def nonmodel_sequences(self, line):
919 self._type = 'nonmodel'
920
921 - def converged(self, line):
922 self._converged = 1
923
924 - def no_hits(self, line):
925 pass
926
927 - def round(self, line):
928 if not line.startswith('Results from round'): 929 raise ValueError("I didn't understand the round line\n%s" % line) 930 self._roundnum = _safe_int(line[18:].strip())
931
932 - def end_descriptions(self):
933 pass
934
935 - def _parse(self, description_line):
936 line = description_line # for convenience 937 dh = Record.Description() 938 939 # I need to separate the score and p-value from the title. 940 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 941 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 942 # special cases to handle: 943 # - title must be preserved exactly (including whitespaces) 944 # - score could be equal to e-value (not likely, but what if??) 945 # - sometimes there's an "N" score of '1'. 946 cols = line.split() 947 if len(cols) < 3: 948 raise ValueError( 949 "Line does not appear to contain description:\n%s" % line) 950 if self.__has_n: 951 i = line.rfind(cols[-1]) # find start of N 952 i = line.rfind(cols[-2], 0, i) # find start of p-value 953 i = line.rfind(cols[-3], 0, i) # find start of score 954 else: 955 i = line.rfind(cols[-1]) # find start of p-value 956 i = line.rfind(cols[-2], 0, i) # find start of score 957 if self.__has_n: 958 dh.title, dh.score, dh.e, dh.num_alignments = \ 959 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 960 else: 961 dh.title, dh.score, dh.e, dh.num_alignments = \ 962 line[:i].rstrip(), cols[-2], cols[-1], 1 963 dh.num_alignments = _safe_int(dh.num_alignments) 964 dh.score = _safe_int(dh.score) 965 dh.e = _safe_float(dh.e) 966 return dh
967 968
969 -class _AlignmentConsumer(object):
970 # This is a little bit tricky. An alignment can either be a 971 # pairwise alignment or a multiple alignment. Since it's difficult 972 # to know a-priori which one the blast record will contain, I'm going 973 # to make one class that can parse both of them.
974 - def start_alignment(self):
975 self._alignment = Record.Alignment() 976 self._multiple_alignment = Record.MultipleAlignment()
977
978 - def title(self, line):
979 if self._alignment.title: 980 self._alignment.title += " " 981 self._alignment.title += line.strip()
982
983 - def length(self, line):
984 # e.g. "Length = 81" or more recently, "Length=428" 985 parts = line.replace(" ", "").split("=") 986 assert len(parts) == 2, "Unrecognised format length line" 987 self._alignment.length = parts[1] 988 self._alignment.length = _safe_int(self._alignment.length)
989
990 - def multalign(self, line):
991 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 992 if line.startswith('QUERY') or line.startswith('blast_tmp'): 993 # If this is the first line of the multiple alignment, 994 # then I need to figure out how the line is formatted. 995 996 # Format of line is: 997 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 998 try: 999 name, start, seq, end = line.split() 1000 except ValueError: 1001 raise ValueError("I do not understand the line\n%s" % line) 1002 self._start_index = line.index(start, len(name)) 1003 self._seq_index = line.index(seq, 1004 self._start_index + len(start)) 1005 # subtract 1 for the space 1006 self._name_length = self._start_index - 1 1007 self._start_length = self._seq_index - self._start_index - 1 1008 self._seq_length = line.rfind(end) - self._seq_index - 1 1009 1010 # self._seq_index = line.index(seq) 1011 # # subtract 1 for the space 1012 # self._seq_length = line.rfind(end) - self._seq_index - 1 1013 # self._start_index = line.index(start) 1014 # self._start_length = self._seq_index - self._start_index - 1 1015 # self._name_length = self._start_index 1016 1017 # Extract the information from the line 1018 name = line[:self._name_length] 1019 name = name.rstrip() 1020 start = line[self._start_index:self._start_index + self._start_length] 1021 start = start.rstrip() 1022 if start: 1023 start = _safe_int(start) 1024 end = line[self._seq_index + self._seq_length:].rstrip() 1025 if end: 1026 end = _safe_int(end) 1027 seq = line[self._seq_index:self._seq_index + self._seq_length].rstrip() 1028 # right pad the sequence with spaces if necessary 1029 if len(seq) < self._seq_length: 1030 seq += ' ' * (self._seq_length - len(seq)) 1031 1032 # I need to make sure the sequence is aligned correctly with the query. 1033 # First, I will find the length of the query. Then, if necessary, 1034 # I will pad my current sequence with spaces so that they will line 1035 # up correctly. 1036 1037 # Two possible things can happen: 1038 # QUERY 1039 # 504 1040 # 1041 # QUERY 1042 # 403 1043 # 1044 # Sequence 504 will need padding at the end. Since I won't know 1045 # this until the end of the alignment, this will be handled in 1046 # end_alignment. 1047 # Sequence 403 will need padding before being added to the alignment. 1048 1049 align = self._multiple_alignment.alignment # for convenience 1050 align.append((name, start, seq, end))
1051 1052 # This is old code that tried to line up all the sequences 1053 # in a multiple alignment by using the sequence title's as 1054 # identifiers. The problem with this is that BLAST assigns 1055 # different HSP's from the same sequence the same id. Thus, 1056 # in one alignment block, there may be multiple sequences with 1057 # the same id. I'm not sure how to handle this, so I'm not 1058 # going to. 1059 1060 # # If the sequence is the query, then just add it. 1061 # if name == 'QUERY': 1062 # if len(align) == 0: 1063 # align.append((name, start, seq)) 1064 # else: 1065 # aname, astart, aseq = align[0] 1066 # if name != aname: 1067 # raise ValueError, "Query is not the first sequence" 1068 # aseq = aseq + seq 1069 # align[0] = aname, astart, aseq 1070 # else: 1071 # if len(align) == 0: 1072 # raise ValueError, "I could not find the query sequence" 1073 # qname, qstart, qseq = align[0] 1074 # 1075 # # Now find my sequence in the multiple alignment. 1076 # for i in range(1, len(align)): 1077 # aname, astart, aseq = align[i] 1078 # if name == aname: 1079 # index = i 1080 # break 1081 # else: 1082 # # If I couldn't find it, then add a new one. 1083 # align.append((None, None, None)) 1084 # index = len(align)-1 1085 # # Make sure to left-pad it. 1086 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1087 # 1088 # if len(qseq) != len(aseq) + len(seq): 1089 # # If my sequences are shorter than the query sequence, 1090 # # then I will need to pad some spaces to make them line up. 1091 # # Since I've already right padded seq, that means aseq 1092 # # must be too short. 1093 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1094 # aseq = aseq + seq 1095 # if astart is None: 1096 # astart = start 1097 # align[index] = aname, astart, aseq 1098
1099 - def end_alignment(self):
1100 # Remove trailing newlines 1101 if self._alignment: 1102 self._alignment.title = self._alignment.title.rstrip() 1103 1104 # This code is also obsolete. See note above. 1105 # If there's a multiple alignment, I will need to make sure 1106 # all the sequences are aligned. That is, I may need to 1107 # right-pad the sequences. 1108 # if self._multiple_alignment is not None: 1109 # align = self._multiple_alignment.alignment 1110 # seqlen = None 1111 # for i in range(len(align)): 1112 # name, start, seq = align[i] 1113 # if seqlen is None: 1114 # seqlen = len(seq) 1115 # else: 1116 # if len(seq) < seqlen: 1117 # seq = seq + ' '*(seqlen - len(seq)) 1118 # align[i] = name, start, seq 1119 # elif len(seq) > seqlen: 1120 # raise ValueError, \ 1121 # "Sequence %s is longer than the query" % name 1122 1123 # Clean up some variables, if they exist. 1124 try: 1125 del self._seq_index 1126 del self._seq_length 1127 del self._start_index 1128 del self._start_length 1129 del self._name_length 1130 except AttributeError: 1131 pass
1132 1133
1134 -class _HSPConsumer(object):
1135 - def start_hsp(self):
1136 self._hsp = Record.HSP()
1137
1138 - def score(self, line):
1139 self._hsp.bits, self._hsp.score = _re_search( 1140 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1141 "I could not find the score in line\n%s" % line) 1142 self._hsp.score = _safe_float(self._hsp.score) 1143 self._hsp.bits = _safe_float(self._hsp.bits) 1144 1145 x, y = _re_search( 1146 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1147 "I could not find the expect in line\n%s" % line) 1148 if x: 1149 self._hsp.num_alignments = _safe_int(x) 1150 else: 1151 self._hsp.num_alignments = 1 1152 self._hsp.expect = _safe_float(y)
1153
1154 - def identities(self, line):
1155 x, y = _re_search( 1156 r"Identities = (\d+)\/(\d+)", line, 1157 "I could not find the identities in line\n%s" % line) 1158 self._hsp.identities = _safe_int(x), _safe_int(y) 1159 self._hsp.align_length = _safe_int(y) 1160 1161 if 'Positives' in line: 1162 x, y = _re_search( 1163 r"Positives = (\d+)\/(\d+)", line, 1164 "I could not find the positives in line\n%s" % line) 1165 self._hsp.positives = _safe_int(x), _safe_int(y) 1166 assert self._hsp.align_length == _safe_int(y) 1167 1168 if 'Gaps' in line: 1169 x, y = _re_search( 1170 r"Gaps = (\d+)\/(\d+)", line, 1171 "I could not find the gaps in line\n%s" % line) 1172 self._hsp.gaps = _safe_int(x), _safe_int(y) 1173 assert self._hsp.align_length == _safe_int(y)
1174
1175 - def strand(self, line):
1176 self._hsp.strand = _re_search( 1177 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1178 "I could not find the strand in line\n%s" % line)
1179
1180 - def frame(self, line):
1181 # Frame can be in formats: 1182 # Frame = +1 1183 # Frame = +2 / +2 1184 if '/' in line: 1185 self._hsp.frame = _re_search( 1186 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1187 "I could not find the frame in line\n%s" % line) 1188 else: 1189 self._hsp.frame = _re_search( 1190 r"Frame = ([-+][123])", line, 1191 "I could not find the frame in line\n%s" % line)
1192 1193 # Match a space, if one is available. Masahir Ishikawa found a 1194 # case where there's no space between the start and the sequence: 1195 # Query: 100tt 101 1196 # line below modified by Yair Benita, Sep 2004 1197 # Note that the colon is not always present. 2006 1198 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1199
1200 - def query(self, line):
1201 m = self._query_re.search(line) 1202 if m is None: 1203 if line.strip() == "Query ------------------------------------------------------------": 1204 # Special case - long gap relative to the subject, 1205 # note there is no start/end present, cannot update those 1206 self._hsp.query += "-" * 60 1207 self._query_len = 60 # number of dashes 1208 self._query_start_index = 13 # offset of first dash 1209 return 1210 raise ValueError("I could not find the query in line\n%s" % line) 1211 1212 # line below modified by Yair Benita, Sep 2004. 1213 # added the end attribute for the query 1214 colon, start, seq, end = m.groups() 1215 seq = seq.strip() 1216 self._hsp.query += seq 1217 if self._hsp.query_start is None: 1218 self._hsp.query_start = _safe_int(start) 1219 1220 # line below added by Yair Benita, Sep 2004. 1221 # added the end attribute for the query 1222 self._hsp.query_end = _safe_int(end) 1223 1224 # Get index for sequence start (regular expression element 3) 1225 self._query_start_index = m.start(3) 1226 self._query_len = len(seq)
1227
1228 - def align(self, line):
1229 seq = line[self._query_start_index:].rstrip() 1230 if len(seq) < self._query_len: 1231 # Make sure the alignment is the same length as the query 1232 seq += ' ' * (self._query_len - len(seq)) 1233 elif len(seq) < self._query_len: 1234 raise ValueError("Match is longer than the query in line\n%s" 1235 % line) 1236 self._hsp.match = self._hsp.match + seq
1237 1238 # To match how we do the query, cache the regular expression. 1239 # Note that the colon is not always present. 1240 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1241
1242 - def sbjct(self, line):
1243 m = self._sbjct_re.search(line) 1244 if m is None: 1245 raise ValueError("I could not find the sbjct in line\n%s" % line) 1246 colon, start, seq, end = m.groups() 1247 # mikep 26/9/00 1248 # On occasion, there is a blast hit with no subject match 1249 # so far, it only occurs with 1-line short "matches" 1250 # I have decided to let these pass as they appear 1251 if not seq.strip(): 1252 seq = ' ' * self._query_len 1253 else: 1254 seq = seq.strip() 1255 self._hsp.sbjct += seq 1256 if self._hsp.sbjct_start is None: 1257 self._hsp.sbjct_start = _safe_int(start) 1258 1259 self._hsp.sbjct_end = _safe_int(end) 1260 if len(seq) != self._query_len: 1261 raise ValueError( 1262 "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s" 1263 % (self._query_len, self._hsp.query, len(seq), line)) 1264 1265 del self._query_start_index # clean up unused variables 1266 del self._query_len
1267
1268 - def end_hsp(self):
1269 pass
1270 1271
1272 -class _DatabaseReportConsumer(object):
1273
1274 - def start_database_report(self):
1275 self._dr = Record.DatabaseReport()
1276
1277 - def database(self, line):
1278 m = re.search(r"Database: (.+)$", line) 1279 if m: 1280 self._dr.database_name.append(m.group(1)) 1281 elif self._dr.database_name: 1282 # This must be a continuation of the previous name. 1283 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1284 line.strip())
1285
1286 - def posted_date(self, line):
1287 self._dr.posted_date.append(_re_search( 1288 r"Posted date:\s*(.+)$", line, 1289 "I could not find the posted date in line\n%s" % line))
1290
1291 - def num_letters_in_database(self, line):
1292 letters, = _get_cols( 1293 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}) 1294 self._dr.num_letters_in_database.append(_safe_int(letters))
1295
1296 - def num_sequences_in_database(self, line):
1297 sequences, = _get_cols( 1298 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}) 1299 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1300
1301 - def ka_params(self, line):
1302 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1303
1304 - def gapped(self, line):
1305 self._dr.gapped = 1
1306
1307 - def ka_params_gap(self, line):
1308 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1309
1310 - def end_database_report(self):
1311 pass
1312 1313
1314 -class _ParametersConsumer(object):
1315 - def start_parameters(self):
1316 self._params = Record.Parameters()
1317
1318 - def matrix(self, line):
1319 self._params.matrix = line[8:].rstrip()
1320
1321 - def gap_penalties(self, line):
1322 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1323 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"})]
1324
1325 - def num_hits(self, line):
1326 if '1st pass' in line: 1327 x, = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1328 self._params.num_hits = _safe_int(x) 1329 else: 1330 x, = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1331 self._params.num_hits = _safe_int(x)
1332
1333 - def num_sequences(self, line):
1334 if '1st pass' in line: 1335 x, = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1336 self._params.num_sequences = _safe_int(x) 1337 else: 1338 x, = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1339 self._params.num_sequences = _safe_int(x)
1340
1341 - def num_extends(self, line):
1342 if '1st pass' in line: 1343 x, = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1344 self._params.num_extends = _safe_int(x) 1345 else: 1346 x, = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1347 self._params.num_extends = _safe_int(x)
1348
1349 - def num_good_extends(self, line):
1350 if '1st pass' in line: 1351 x, = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1352 self._params.num_good_extends = _safe_int(x) 1353 else: 1354 x, = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1355 self._params.num_good_extends = _safe_int(x)
1356
1357 - def num_seqs_better_e(self, line):
1358 self._params.num_seqs_better_e, = _get_cols( 1359 line, (-1,), ncols=7, expected={2: "sequences"}) 1360 self._params.num_seqs_better_e = _safe_int( 1361 self._params.num_seqs_better_e)
1362
1363 - def hsps_no_gap(self, line):
1364 self._params.hsps_no_gap, = _get_cols( 1365 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}) 1366 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1367
1368 - def hsps_prelim_gapped(self, line):
1369 self._params.hsps_prelim_gapped, = _get_cols( 1370 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}) 1371 self._params.hsps_prelim_gapped = _safe_int( 1372 self._params.hsps_prelim_gapped)
1373
1374 - def hsps_prelim_gapped_attempted(self, line):
1375 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1376 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}) 1377 self._params.hsps_prelim_gapped_attempted = _safe_int( 1378 self._params.hsps_prelim_gapped_attempted)
1379
1380 - def hsps_gapped(self, line):
1381 self._params.hsps_gapped, = _get_cols( 1382 line, (-1,), ncols=6, expected={3: "gapped"}) 1383 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1384
1385 - def query_length(self, line):
1386 self._params.query_length, = _get_cols( 1387 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}) 1388 self._params.query_length = _safe_int(self._params.query_length)
1389
1390 - def database_length(self, line):
1391 self._params.database_length, = _get_cols( 1392 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}) 1393 self._params.database_length = _safe_int(self._params.database_length)
1394
1395 - def effective_hsp_length(self, line):
1396 self._params.effective_hsp_length, = _get_cols( 1397 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}) 1398 self._params.effective_hsp_length = _safe_int( 1399 self._params.effective_hsp_length)
1400
1401 - def effective_query_length(self, line):
1402 self._params.effective_query_length, = _get_cols( 1403 line, (-1,), ncols=5, expected={1: "length", 3: "query:"}) 1404 self._params.effective_query_length = _safe_int( 1405 self._params.effective_query_length)
1406
1407 - def effective_database_length(self, line):
1408 self._params.effective_database_length, = _get_cols( 1409 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}) 1410 self._params.effective_database_length = _safe_int( 1411 self._params.effective_database_length)
1412
1413 - def effective_search_space(self, line):
1414 self._params.effective_search_space, = _get_cols( 1415 line, (-1,), ncols=4, expected={1: "search"}) 1416 self._params.effective_search_space = _safe_int( 1417 self._params.effective_search_space)
1418
1419 - def effective_search_space_used(self, line):
1420 self._params.effective_search_space_used, = _get_cols( 1421 line, (-1,), ncols=5, expected={1: "search", 3: "used:"}) 1422 self._params.effective_search_space_used = _safe_int( 1423 self._params.effective_search_space_used)
1424
1425 - def frameshift(self, line):
1426 self._params.frameshift = _get_cols( 1427 line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"})
1428
1429 - def threshold(self, line):
1430 if line[:2] == "T:": 1431 # Assume its an old style line like "T: 123" 1432 self._params.threshold, = _get_cols( 1433 line, (1,), ncols=2, expected={0: "T:"}) 1434 elif line[:28] == "Neighboring words threshold:": 1435 self._params.threshold, = _get_cols( 1436 line, (3,), ncols=4, expected={0: "Neighboring", 1: "words", 2: "threshold:"}) 1437 else: 1438 raise ValueError("Unrecognised threshold line:\n%s" % line) 1439 self._params.threshold = _safe_int(self._params.threshold)
1440
1441 - def window_size(self, line):
1442 if line[:2] == "A:": 1443 self._params.window_size, = _get_cols( 1444 line, (1,), ncols=2, expected={0: "A:"}) 1445 elif line[:25] == "Window for multiple hits:": 1446 self._params.window_size, = _get_cols( 1447 line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"}) 1448 else: 1449 raise ValueError("Unrecognised window size line:\n%s" % line) 1450 self._params.window_size = _safe_int(self._params.window_size)
1451
1452 - def dropoff_1st_pass(self, line):
1453 score, bits = _re_search( 1454 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1455 "I could not find the dropoff in line\n%s" % line) 1456 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1457
1458 - def gap_x_dropoff(self, line):
1459 score, bits = _re_search( 1460 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1461 "I could not find the gap dropoff in line\n%s" % line) 1462 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1463
1464 - def gap_x_dropoff_final(self, line):
1465 score, bits = _re_search( 1466 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1467 "I could not find the gap dropoff final in line\n%s" % line) 1468 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1469
1470 - def gap_trigger(self, line):
1471 score, bits = _re_search( 1472 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1473 "I could not find the gap trigger in line\n%s" % line) 1474 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1475
1476 - def blast_cutoff(self, line):
1477 score, bits = _re_search( 1478 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1479 "I could not find the blast cutoff in line\n%s" % line) 1480 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1481
1482 - def end_parameters(self):
1483 pass
1484 1485
1486 -class _BlastConsumer(AbstractConsumer, 1487 _HeaderConsumer, 1488 _DescriptionConsumer, 1489 _AlignmentConsumer, 1490 _HSPConsumer, 1491 _DatabaseReportConsumer, 1492 _ParametersConsumer 1493 ):
1494 # This Consumer is inherits from many other consumer classes that handle 1495 # the actual dirty work. An alternate way to do it is to create objects 1496 # of those classes and then delegate the parsing tasks to them in a 1497 # decorator-type pattern. The disadvantage of that is that the method 1498 # names will need to be resolved in this classes. However, using 1499 # a decorator will retain more control in this class (which may or 1500 # may not be a bad thing). In addition, having each sub-consumer as 1501 # its own object prevents this object's dictionary from being cluttered 1502 # with members and reduces the chance of member collisions.
1503 - def __init__(self):
1504 self.data = None
1505
1506 - def round(self, line):
1507 # Make sure nobody's trying to pass me PSI-BLAST data! 1508 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1509
1510 - def start_header(self):
1511 self.data = Record.Blast() 1512 _HeaderConsumer.start_header(self)
1513
1514 - def end_header(self):
1515 _HeaderConsumer.end_header(self) 1516 self.data.__dict__.update(self._header.__dict__)
1517
1518 - def end_descriptions(self):
1519 self.data.descriptions = self._descriptions
1520
1521 - def end_alignment(self):
1522 _AlignmentConsumer.end_alignment(self) 1523 if self._alignment.hsps: 1524 self.data.alignments.append(self._alignment) 1525 if self._multiple_alignment.alignment: 1526 self.data.multiple_alignment = self._multiple_alignment
1527
1528 - def end_hsp(self):
1529 _HSPConsumer.end_hsp(self) 1530 try: 1531 self._alignment.hsps.append(self._hsp) 1532 except AttributeError: 1533 raise ValueError("Found an HSP before an alignment")
1534
1535 - def end_database_report(self):
1536 _DatabaseReportConsumer.end_database_report(self) 1537 self.data.__dict__.update(self._dr.__dict__)
1538
1539 - def end_parameters(self):
1540 _ParametersConsumer.end_parameters(self) 1541 self.data.__dict__.update(self._params.__dict__)
1542 1543
1544 -class _PSIBlastConsumer(AbstractConsumer, 1545 _HeaderConsumer, 1546 _DescriptionConsumer, 1547 _AlignmentConsumer, 1548 _HSPConsumer, 1549 _DatabaseReportConsumer, 1550 _ParametersConsumer 1551 ):
1552 - def __init__(self):
1553 self.data = None
1554
1555 - def start_header(self):
1556 self.data = Record.PSIBlast() 1557 _HeaderConsumer.start_header(self)
1558
1559 - def end_header(self):
1560 _HeaderConsumer.end_header(self) 1561 self.data.__dict__.update(self._header.__dict__)
1562
1563 - def start_descriptions(self):
1564 self._round = Record.Round() 1565 self.data.rounds.append(self._round) 1566 _DescriptionConsumer.start_descriptions(self)
1567
1568 - def end_descriptions(self):
1569 _DescriptionConsumer.end_descriptions(self) 1570 self._round.number = self._roundnum 1571 if self._descriptions: 1572 self._round.new_seqs.extend(self._descriptions) 1573 self._round.reused_seqs.extend(self._model_sequences) 1574 self._round.new_seqs.extend(self._nonmodel_sequences) 1575 if self._converged: 1576 self.data.converged = 1
1577
1578 - def end_alignment(self):
1579 _AlignmentConsumer.end_alignment(self) 1580 if self._alignment.hsps: 1581 self._round.alignments.append(self._alignment) 1582 if self._multiple_alignment: 1583 self._round.multiple_alignment = self._multiple_alignment
1584
1585 - def end_hsp(self):
1586 _HSPConsumer.end_hsp(self) 1587 try: 1588 self._alignment.hsps.append(self._hsp) 1589 except AttributeError: 1590 raise ValueError("Found an HSP before an alignment")
1591
1592 - def end_database_report(self):
1593 _DatabaseReportConsumer.end_database_report(self) 1594 self.data.__dict__.update(self._dr.__dict__)
1595
1596 - def end_parameters(self):
1597 _ParametersConsumer.end_parameters(self) 1598 self.data.__dict__.update(self._params.__dict__)
1599 1600
1601 -class Iterator(object):
1602 """Iterates over a file of multiple BLAST results. 1603 1604 Methods: 1605 next Return the next record from the stream, or None. 1606 1607 """
1608 - def __init__(self, handle, parser=None):
1609 """__init__(self, handle, parser=None) 1610 1611 Create a new iterator. handle is a file-like object. parser 1612 is an optional Parser object to change the results into another form. 1613 If set to None, then the raw contents of the file will be returned. 1614 1615 """ 1616 try: 1617 handle.readline 1618 except AttributeError: 1619 raise ValueError( 1620 "I expected a file handle or file-like object, got %s" 1621 % type(handle)) 1622 self._uhandle = File.UndoHandle(handle) 1623 self._parser = parser 1624 self._header = []
1625
1626 - def __next__(self):
1627 """next(self) -> object 1628 1629 Return the next Blast record from the file. If no more records, 1630 return None. 1631 1632 """ 1633 lines = [] 1634 query = False 1635 while True: 1636 line = self._uhandle.readline() 1637 if not line: 1638 break 1639 # If I've reached the next one, then put the line back and stop. 1640 if lines and (line.startswith('BLAST') or 1641 line.startswith('BLAST', 1) or 1642 line.startswith('<?xml ')): 1643 self._uhandle.saveline(line) 1644 break 1645 # New style files omit the BLAST line to mark a new query: 1646 if line.startswith("Query="): 1647 if not query: 1648 if not self._header: 1649 self._header = lines[:] 1650 query = True 1651 else: 1652 # Start of another record 1653 self._uhandle.saveline(line) 1654 break 1655 lines.append(line) 1656 1657 if query and "BLAST" not in lines[0]: 1658 # Cheat and re-insert the header 1659 # print "-"*50 1660 # print "".join(self._header) 1661 # print "-"*50 1662 # print "".join(lines) 1663 # print "-"*50 1664 lines = self._header + lines 1665 1666 if not lines: 1667 return None 1668 1669 data = ''.join(lines) 1670 if self._parser is not None: 1671 return self._parser.parse(StringIO(data)) 1672 return data
1673 1674 if sys.version_info[0] < 3:
1675 - def next(self):
1676 """Python 2 style alias for Python 3 style __next__ method.""" 1677 return self.__next__()
1678
1679 - def __iter__(self):
1680 return iter(self.__next__, None)
1681 1682
1683 -def _re_search(regex, line, error_msg):
1684 m = re.search(regex, line) 1685 if not m: 1686 raise ValueError(error_msg) 1687 return m.groups()
1688 1689
1690 -def _get_cols(line, cols_to_get, ncols=None, expected=None):
1691 if expected is None: 1692 expected = {} 1693 cols = line.split() 1694 1695 # Check to make sure number of columns is correct 1696 if ncols is not None and len(cols) != ncols: 1697 raise ValueError("I expected %d columns (got %d) in line\n%s" 1698 % (ncols, len(cols), line)) 1699 1700 # Check to make sure columns contain the correct data 1701 for k in expected: 1702 if cols[k] != expected[k]: 1703 raise ValueError("I expected '%s' in column %d in line\n%s" 1704 % (expected[k], k, line)) 1705 1706 # Construct the answer tuple 1707 results = [] 1708 for c in cols_to_get: 1709 results.append(cols[c]) 1710 return tuple(results)
1711 1712
1713 -def _safe_int(str):
1714 try: 1715 return int(str) 1716 except ValueError: 1717 # Something went wrong. Try to clean up the string. 1718 # Remove all commas from the string 1719 str = str.replace(',', '') 1720 # try again after removing commas. 1721 # Note int() will return a long rather than overflow 1722 try: 1723 return int(str) 1724 except ValueError: 1725 pass 1726 # Call float to handle things like "54.3", note could lose precision, e.g. 1727 # >>> int("5399354557888517312") 1728 # 5399354557888517312 1729 # >>> int(float("5399354557888517312")) 1730 # 5399354557888517120 1731 return int(float(str))
1732 1733
1734 -def _safe_float(str):
1735 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1736 # float('e-172') does not produce an error on his platform. Thus, 1737 # we need to check the string for this condition. 1738 1739 # Sometimes BLAST leaves of the '1' in front of an exponent. 1740 if str and str[0] in ['E', 'e']: 1741 str = '1' + str 1742 try: 1743 return float(str) 1744 except ValueError: 1745 # Remove all commas from the string 1746 str = str.replace(',', '') 1747 # try again. 1748 return float(str)
1749 1750
1751 -class _BlastErrorConsumer(_BlastConsumer):
1752 - def __init__(self):
1754
1755 - def noevent(self, line):
1756 if 'Query must be at least wordsize' in line: 1757 raise ShortQueryBlastError("Query must be at least wordsize") 1758 # Now pass the line back up to the superclass. 1759 method = getattr(_BlastConsumer, 'noevent', 1760 _BlastConsumer.__getattr__(self, 'noevent')) 1761 method(line)
1762 1763
1764 -class BlastErrorParser(AbstractParser):
1765 """Attempt to catch and diagnose BLAST errors while parsing. 1766 1767 This utilizes the BlastParser module but adds an additional layer 1768 of complexity on top of it by attempting to diagnose ValueErrors 1769 that may actually indicate problems during BLAST parsing. 1770 1771 Current BLAST problems this detects are: 1772 o LowQualityBlastError - When BLASTing really low quality sequences 1773 (ie. some GenBank entries which are just short stretches of a single 1774 nucleotide), BLAST will report an error with the sequence and be 1775 unable to search with this. This will lead to a badly formatted 1776 BLAST report that the parsers choke on. The parser will convert the 1777 ValueError to a LowQualityBlastError and attempt to provide useful 1778 information. 1779 """
1780 - def __init__(self, bad_report_handle=None):
1781 """Initialize a parser that tries to catch BlastErrors. 1782 1783 Arguments: 1784 o bad_report_handle - An optional argument specifying a handle 1785 where bad reports should be sent. This would allow you to save 1786 all of the bad reports to a file, for instance. If no handle 1787 is specified, the bad reports will not be saved. 1788 """ 1789 self._bad_report_handle = bad_report_handle 1790 1791 # self._b_parser = BlastParser() 1792 self._scanner = _Scanner() 1793 self._consumer = _BlastErrorConsumer()
1794
1795 - def parse(self, handle):
1796 """Parse a handle, attempting to diagnose errors. 1797 """ 1798 results = handle.read() 1799 1800 try: 1801 self._scanner.feed(StringIO(