Package Bio :: Package Blast :: Module Applications
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.Applications

   1  # Copyright 2001 Brad Chapman. 
   2  # Revisions copyright 2009-2010 by Peter Cock. 
   3  # Revisions copyright 2010 by Phillip Garland. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Definitions for interacting with BLAST related applications. 
   9   
  10  Wrappers for the new NCBI BLAST+ tools (written in C++): 
  11   
  12   - NcbiblastpCommandline - Protein-Protein BLAST 
  13   - NcbiblastnCommandline - Nucleotide-Nucleotide BLAST 
  14   - NcbiblastxCommandline - Translated Query-Protein Subject BLAST 
  15   - NcbitblastnCommandline - Protein Query-Translated Subject BLAST 
  16   - NcbitblastxCommandline - Translated Query-Protein Subject BLAST 
  17   - NcbipsiblastCommandline - Position-Specific Initiated BLAST 
  18   - NcbirpsblastCommandline - Reverse Position Specific BLAST 
  19   - NcbirpstblastnCommandline - Translated Reverse Position Specific BLAST 
  20   - NcbideltablastCommandline - Protein-Protein domain enhanced lookup time accelerated blast 
  21   - NcbiblastformatterCommandline - Convert ASN.1 to other BLAST output formats 
  22   
  23  For further details, see: 
  24   
  25  Camacho et al. BLAST+: architecture and applications 
  26  BMC Bioinformatics 2009, 10:421 
  27  doi:10.1186/1471-2105-10-421 
  28  """ 
  29  from __future__ import print_function 
  30   
  31  from Bio.Application import _Option, AbstractCommandline, _Switch 
  32   
  33  __docformat__ = "restructuredtext en" 
  34   
  35   
36 -class _NcbibaseblastCommandline(AbstractCommandline):
37 """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). 38 39 This is provided for subclassing, it deals with shared options 40 common to all the BLAST tools (blastn, rpsblast, rpsblast, etc 41 AND blast_formatter). 42 """
43 - def __init__(self, cmd=None, **kwargs):
44 assert cmd is not None 45 extra_parameters = [ 46 # Core: 47 _Switch(["-h", "h"], 48 "Print USAGE and DESCRIPTION; ignore other arguments."), 49 _Switch(["-help", "help"], 50 "Print USAGE, DESCRIPTION and ARGUMENTS description; " 51 "ignore other arguments."), 52 _Switch(["-version", "version"], 53 "Print version number; ignore other arguments."), 54 # Output configuration options 55 _Option(["-out", "out"], 56 "Output file for alignment.", 57 filename=True, 58 equate=False), 59 # Formatting options: 60 _Option(["-outfmt", "outfmt"], 61 "Alignment view. Integer 0-11. Use 5 for XML output " 62 "(differs from classic BLAST which used 7 for XML).", 63 equate=False), 64 # TODO - Document and test the column options 65 _Switch(["-show_gis", "show_gis"], 66 "Show NCBI GIs in deflines?"), 67 _Option(["-num_descriptions", "num_descriptions"], 68 """Number of database sequences to show one-line descriptions for. 69 70 Integer argument (at least zero). Default is 500. 71 See also num_alignments.""", 72 equate=False), 73 _Option(["-num_alignments", "num_alignments"], 74 """Number of database sequences to show num_alignments for. 75 76 Integer argument (at least zero). Default is 200. 77 See also num_alignments.""", 78 equate=False), 79 _Option(["-line_length", "line_length"], 80 """Line length for formatting alignments (integer, at least 1, default 60). 81 82 Not applicable for outfmt > 4. 83 84 Added in BLAST+ 2.2.30. 85 """, 86 equate=False), 87 _Switch(["-html", "html"], 88 "Produce HTML output? See also the outfmt option."), 89 # Miscellaneous options 90 _Switch(["-parse_deflines", "parse_deflines"], 91 "Should the query and subject defline(s) be parsed?"), 92 ] 93 try: 94 # Insert extra parameters - at the start just in case there 95 # are any arguments which must come last: 96 self.parameters = extra_parameters + self.parameters 97 except AttributeError: 98 # Should we raise an error? The subclass should have set this up! 99 self.parameters = extra_parameters 100 AbstractCommandline.__init__(self, cmd, **kwargs)
101
102 - def _validate_incompatibilities(self, incompatibles):
103 """Used by the BLAST+ _validate method (PRIVATE).""" 104 for a in incompatibles: 105 if self._get_parameter(a): 106 for b in incompatibles[a]: 107 if self._get_parameter(b): 108 raise ValueError("Options %s and %s are incompatible." 109 % (a, b))
110 111
112 -class _NcbiblastCommandline(_NcbibaseblastCommandline):
113 """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). 114 115 This is provided for subclassing, it deals with shared options 116 common to all the BLAST tools (blastn, rpsblast, rpsblast, etc). 117 """
118 - def __init__(self, cmd=None, **kwargs):
119 assert cmd is not None 120 extra_parameters = [ 121 # Input query options: 122 _Option(["-query", "query"], 123 "The sequence to search with.", 124 filename=True, 125 equate=False), # Should this be required? 126 _Option(["-query_loc", "query_loc"], 127 "Location on the query sequence (Format: start-stop)", 128 equate=False), 129 # General search options: 130 _Option(["-db", "db"], 131 "The database to BLAST against.", 132 equate=False), 133 _Option(["-evalue", "evalue"], 134 "Expectation value cutoff.", 135 equate=False), 136 _Option(["-word_size", "word_size"], 137 """Word size for wordfinder algorithm. 138 139 Integer. Minimum 2.""", 140 equate=False), 141 # BLAST-2-Sequences options: 142 # - see subclass 143 # Formatting options: 144 # - see baseclass 145 # Query filtering options 146 _Option(["-soft_masking", "soft_masking"], 147 "Apply filtering locations as soft masks (Boolean, Default = true)", 148 equate=False), 149 _Switch(["-lcase_masking", "lcase_masking"], 150 "Use lower case filtering in query and subject sequence(s)?"), 151 # Restrict search or results 152 _Option(["-gilist", "gilist"], 153 """Restrict search of database to list of GI's. 154 155 Incompatible with: negative_gilist, seqidlist, remote, subject, subject_loc""", 156 filename=True, 157 equate=False), 158 _Option(["-negative_gilist", "negative_gilist"], 159 """Restrict search of database to everything except the listed GIs. 160 161 Incompatible with: gilist, seqidlist, remote, subject, subject_loc""", 162 filename=True, 163 equate=False), 164 _Option(["-seqidlist", "seqidlist"], 165 """Restrict search of database to list of SeqID's. 166 167 Incompatible with: gilist, negative_gilist, remote, subject, subject_loc""", 168 filename=True, 169 equate=False), 170 _Option(["-entrez_query", "entrez_query"], 171 "Restrict search with the given Entrez query (requires remote).", 172 equate=False), 173 _Option(["-qcov_hsp_perc", "qcov_hsp_perc"], 174 """Percent query coverage per hsp (float, 0 to 100). 175 176 Added in BLAST+ 2.2.30. 177 """, 178 equate=False), 179 _Option(["-max_target_seqs", "max_target_seqs"], 180 "Maximum number of aligned sequences to keep (integer, at least one).", 181 equate=False), 182 # Statistical options 183 _Option(["-dbsize", "dbsize"], 184 "Effective length of the database (integer).", 185 equate=False), 186 _Option(["-searchsp", "searchsp"], 187 "Effective length of the search space (integer).", 188 equate=False), 189 _Option(["-max_hsps_per_subject", "max_hsps_per_subject"], 190 "Override maximum number of HSPs per subject to save for ungapped searches (integer).", 191 equate=False), 192 _Option(["-max_hsps", "max_hsps"], 193 "Set maximum number of HSPs per subject sequence to save (default 0 means no limit).", 194 equate=False), 195 _Switch(["-sum_statistics", "sum_statistics"], 196 "Use sum statistics."), 197 # Is -sum_stats a BLAST+ bug, why not use -sum_statistics switch? 198 _Option(["-sum_stats", "sum_stats"], 199 """Use sum statistics (boolean). 200 201 Added in BLAST+ 2.2.30. 202 """, 203 equate=False), 204 # Extension options 205 _Option(["-xdrop_ungap", "xdrop_ungap"], 206 "X-dropoff value (in bits) for ungapped extensions (float).", 207 equate=False), 208 _Option(["-xdrop_gap", "xdrop_gap"], 209 "X-dropoff value (in bits) for preliminary gapped extensions (float).", 210 equate=False), 211 _Option(["-xdrop_gap_final", "xdrop_gap_final"], 212 "X-dropoff value (in bits) for final gapped alignment (float).", 213 equate=False), 214 _Option(["-window_size", "window_size"], 215 "Multiple hits window size, use 0 to specify 1-hit algorithm (integer).", 216 equate=False), 217 # Search strategy options 218 _Option(["-import_search_strategy", "import_search_strategy"], 219 """Search strategy to use. 220 221 Incompatible with: export_search_strategy""", 222 filename=True, 223 equate=False), 224 _Option(["-export_search_strategy", "export_search_strategy"], 225 """File name to record the search strategy used. 226 227 Incompatible with: import_search_strategy""", 228 filename=True, 229 equate=False), 230 # Miscellaneous options 231 _Option(["-num_threads", "num_threads"], 232 """Number of threads to use in the BLAST search (integer, at least one). 233 234 Default is one. 235 Incompatible with: remote""", 236 equate=False), 237 _Switch(["-remote", "remote"], 238 """Execute search remotely? 239 240 Incompatible with: gilist, negative_gilist, subject_loc, num_threads, ..."""), 241 ] 242 try: 243 # Insert extra parameters - at the start just in case there 244 # are any arguments which must come last: 245 self.parameters = extra_parameters + self.parameters 246 except AttributeError: 247 # Should we raise an error? The subclass should have set this up! 248 self.parameters = extra_parameters 249 _NcbibaseblastCommandline.__init__(self, cmd, **kwargs)
250
251 - def _validate(self):
252 incompatibles = {"remote": ["gilist", "negative_gilist", "num_threads"], 253 "import_search_strategy": ["export_search_strategy"], 254 "gilist": ["negative_gilist"], 255 "seqidlist": ["gilist", "negative_gilist", "remote"]} 256 self._validate_incompatibilities(incompatibles) 257 if self.entrez_query and not self.remote: 258 raise ValueError("Option entrez_query requires remote option.") 259 AbstractCommandline._validate(self)
260 261
262 -class _Ncbiblast2SeqCommandline(_NcbiblastCommandline):
263 """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). 264 265 This is provided for subclassing, it deals with shared options 266 common to all the BLAST tools supporting two-sequence BLAST 267 (blastn, psiblast, etc) but not rpsblast or rpstblastn. 268 """
269 - def __init__(self, cmd=None, **kwargs):
270 assert cmd is not None 271 extra_parameters = [ 272 # General search options: 273 _Option(["-gapopen", "gapopen"], 274 "Cost to open a gap (integer).", 275 equate=False), 276 _Option(["-gapextend", "gapextend"], 277 "Cost to extend a gap (integer).", 278 equate=False), 279 # BLAST-2-Sequences options: 280 _Option(["-subject", "subject"], 281 """Subject sequence(s) to search. 282 283 Incompatible with: db, gilist, negative_gilist. 284 See also subject_loc.""", 285 filename=True, 286 equate=False), 287 _Option(["-subject_loc", "subject_loc"], 288 """Location on the subject sequence (Format: start-stop). 289 290 Incompatible with: db, gilist, seqidlist, negative_gilist, 291 db_soft_mask, db_hard_mask, remote. 292 293 See also subject.""", 294 equate=False), 295 # Restrict search or results: 296 _Option(["-culling_limit", "culling_limit"], 297 """Hit culling limit (integer). 298 299 If the query range of a hit is enveloped by that of at 300 least this many higher-scoring hits, delete the hit. 301 302 Incompatible with: best_hit_overhang, best_hit_score_edge. 303 """, 304 equate=False), 305 _Option(["-best_hit_overhang", "best_hit_overhang"], 306 """Best Hit algorithm overhang value (float, recommended value: 0.1) 307 308 Float between 0.0 and 0.5 inclusive. 309 310 Incompatible with: culling_limit.""", 311 equate=False), 312 _Option(["-best_hit_score_edge", "best_hit_score_edge"], 313 """Best Hit algorithm score edge value (float, recommended value: 0.1) 314 315 Float between 0.0 and 0.5 inclusive. 316 317 Incompatible with: culling_limit.""", 318 equate=False), 319 ] 320 try: 321 # Insert extra parameters - at the start just in case there 322 # are any arguments which must come last: 323 self.parameters = extra_parameters + self.parameters 324 except AttributeError: 325 # Should we raise an error? The subclass should have set this up! 326 self.parameters = extra_parameters 327 _NcbiblastCommandline.__init__(self, cmd, **kwargs)
328
329 - def _validate(self):
330 incompatibles = {"subject_loc": ["db", "gilist", "negative_gilist", "seqidlist", "remote"], 331 "culling_limit": ["best_hit_overhang", "best_hit_score_edge"], 332 "subject": ["db", "gilist", "negative_gilist", "seqidlist"]} 333 self._validate_incompatibilities(incompatibles) 334 _NcbiblastCommandline._validate(self)
335 336
337 -class _NcbiblastMain2SeqCommandline(_Ncbiblast2SeqCommandline):
338 """Base Commandline object for (new) NCBI BLAST+ wrappers (PRIVATE). 339 340 This is provided for subclassing, it deals with shared options 341 common to the main BLAST tools blastp, blastn, blastx, tblastx, tblastn 342 but not psiblast, rpsblast or rpstblastn. 343 """
344 - def __init__(self, cmd=None, **kwargs):
345 assert cmd is not None 346 extra_parameters = [ 347 # Restrict search or results: 348 _Option(["-db_soft_mask", "db_soft_mask"], 349 """Filtering algorithm for soft masking (integer). 350 351 Filtering algorithm ID to apply to the BLAST database as soft masking. 352 353 Incompatible with: db_hard_mask, subject, subject_loc""", 354 equate=False), 355 _Option(["-db_hard_mask", "db_hard_mask"], 356 """Filtering algorithm for hard masking (integer). 357 358 Filtering algorithm ID to apply to the BLAST database as hard masking. 359 360 Incompatible with: db_soft_mask, subject, subject_loc""", 361 equate=False), 362 ] 363 try: 364 # Insert extra parameters - at the start just in case there 365 # are any arguments which must come last: 366 self.parameters = extra_parameters + self.parameters 367 except AttributeError: 368 # Should we raise an error? The subclass should have set this up! 369 self.parameters = extra_parameters 370 _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
371
372 - def _validate(self):
373 incompatibles = {"db_soft_mask": ["db_hard_mask", "subject", "subject_loc"], 374 "db_hard_mask": ["db_soft_mask", "subject", "subject_loc"]} 375 self._validate_incompatibilities(incompatibles) 376 _Ncbiblast2SeqCommandline._validate(self)
377 378
379 -class NcbiblastpCommandline(_NcbiblastMain2SeqCommandline):
380 """Create a commandline for the NCBI BLAST+ program blastp (for proteins). 381 382 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 383 replaced the old blastall tool with separate tools for each of the searches. 384 This wrapper therefore replaces BlastallCommandline with option -p blastp. 385 386 >>> from Bio.Blast.Applications import NcbiblastpCommandline 387 >>> cline = NcbiblastpCommandline(query="rosemary.pro", db="nr", 388 ... evalue=0.001, remote=True, ungapped=True) 389 >>> cline 390 NcbiblastpCommandline(cmd='blastp', query='rosemary.pro', db='nr', evalue=0.001, remote=True, ungapped=True) 391 >>> print(cline) 392 blastp -query rosemary.pro -db nr -evalue 0.001 -remote -ungapped 393 394 You would typically run the command line with cline() or via the Python 395 subprocess module, as described in the Biopython tutorial. 396 """
397 - def __init__(self, cmd="blastp", **kwargs):
398 self.parameters = [ 399 # General search options: 400 _Option(["-task", "task"], 401 "Task to execute (string, blastp (default), blastp-fast or blastp-short).", 402 checker_function=lambda value: value in ["blastp", 403 "blastp-fast", 404 "blastp-short"], 405 equate=False), 406 _Option(["-matrix", "matrix"], 407 "Scoring matrix name (default BLOSUM62)."), 408 _Option(["-threshold", "threshold"], 409 "Minimum score for words to be added to the BLAST lookup table (float).", 410 equate=False), 411 _Option(["-comp_based_stats", "comp_based_stats"], 412 """Use composition-based statistics (string, default 2, i.e. True). 413 414 0, F or f: no composition-based statistics 415 2, T or t, D or d : Composition-based score adjustment as in 416 Bioinformatics 21:902-911, 2005, conditioned on sequence properties 417 418 Note that tblastn also supports values of 1 and 3.""", 419 checker_function=lambda value: value in "0Ft2TtDd", 420 equate=False), 421 # Query filtering options: 422 _Option(["-seg", "seg"], 423 """Filter query sequence with SEG (string). 424 425 Format: "yes", "window locut hicut", or "no" to disable. 426 Default is "12 2.2 2.5""", 427 equate=False), 428 # Extension options: 429 _Switch(["-ungapped", "ungapped"], 430 "Perform ungapped alignment only?"), 431 # Miscellaneous options: 432 _Switch(["-use_sw_tback", "use_sw_tback"], 433 "Compute locally optimal Smith-Waterman alignments?"), 434 ] 435 _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
436 437
438 -class NcbiblastnCommandline(_NcbiblastMain2SeqCommandline):
439 """Wrapper for the NCBI BLAST+ program blastn (for nucleotides). 440 441 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 442 replaced the old blastall tool with separate tools for each of the searches. 443 This wrapper therefore replaces BlastallCommandline with option -p blastn. 444 445 For example, to run a search against the "nt" nucleotide database using the 446 FASTA nucleotide file "m_code.fasta" as the query, with an expectation value 447 cut off of 0.001, saving the output to a file in XML format: 448 449 >>> from Bio.Blast.Applications import NcbiblastnCommandline 450 >>> cline = NcbiblastnCommandline(query="m_cold.fasta", db="nt", strand="plus", 451 ... evalue=0.001, out="m_cold.xml", outfmt=5) 452 >>> cline 453 NcbiblastnCommandline(cmd='blastn', out='m_cold.xml', outfmt=5, query='m_cold.fasta', db='nt', evalue=0.001, strand='plus') 454 >>> print(cline) 455 blastn -out m_cold.xml -outfmt 5 -query m_cold.fasta -db nt -evalue 0.001 -strand plus 456 457 You would typically run the command line with cline() or via the Python 458 subprocess module, as described in the Biopython tutorial. 459 """
460 - def __init__(self, cmd="blastn", **kwargs):
461 self.parameters = [ 462 # Input query options: 463 _Option(["-strand", "strand"], 464 """Query strand(s) to search against database/subject. 465 466 Values allowed are "both" (default), "minus", "plus".""", 467 checker_function=lambda value: value in ["both", 468 "minus", 469 "plus"], 470 equate=False), 471 # General search options: 472 _Option(["-task", "task"], 473 """Task to execute (string, default 'megablast') 474 475 Allowed values 'blastn', 'blastn-short', 'dc-megablast', 'megablast' 476 (the default), or 'vecscreen'.""", 477 checker_function=lambda value: value in ['blastn', 478 'blastn-short', 479 'dc-megablast', 480 'megablast', 481 'vecscreen'], 482 equate=False), 483 _Option(["-penalty", "penalty"], 484 "Penalty for a nucleotide mismatch (integer, at most zero).", 485 equate=False), 486 _Option(["-reward", "reward"], 487 "Reward for a nucleotide match (integer, at least zero).", 488 equate=False), 489 _Option(["-use_index", "use_index"], 490 "Use MegaBLAST database index (Boolean, Default = False)", 491 equate=False), 492 _Option(["-index_name", "index_name"], 493 "MegaBLAST database index name.", 494 equate=False), 495 # Query filtering options: 496 _Option(["-dust", "dust"], 497 """Filter query sequence with DUST (string). 498 499 Format: 'yes', 'level window linker', or 'no' to disable. 500 Default = '20 64 1'. 501 """, 502 equate=False), 503 _Option(["-filtering_db", "filtering_db"], 504 "BLAST database containing filtering elements (i.e. repeats).", 505 equate=False), 506 _Option(["-window_masker_taxid", "window_masker_taxid"], 507 "Enable WindowMasker filtering using a Taxonomic ID (integer).", 508 equate=False), 509 _Option(["-window_masker_db", "window_masker_db"], 510 "Enable WindowMasker filtering using this repeats database (string).", 511 equate=False), 512 # Restrict search or results: 513 _Option(["-perc_identity", "perc_identity"], 514 "Percent identity (real, 0 to 100 inclusive).", 515 equate=False), 516 # Discontiguous MegaBLAST options 517 _Option(["-template_type", "template_type"], 518 """Discontiguous MegaBLAST template type (string). 519 520 Allowed values: 'coding', 'coding_and_optimal' or 'optimal' 521 Requires: template_length.""", 522 checker_function=lambda value: value in ['coding', 'coding_and_optimal', 'optimal'], 523 equate=False), 524 _Option(["-template_length", "template_length"], 525 """Discontiguous MegaBLAST template length (integer). 526 527 Allowed values: 16, 18, 21 528 529 Requires: template_type.""", 530 checker_function=lambda value: value in [16, 18, 21, '16', '18', '21'], 531 equate=False), 532 # Extension options: 533 _Switch(["-no_greedy", "no_greedy"], 534 "Use non-greedy dynamic programming extension"), 535 _Option(["-min_raw_gapped_score", "min_raw_gapped_score"], 536 "Minimum raw gapped score to keep an alignment in the " 537 "preliminary gapped and traceback stages (integer).", 538 equate=False), 539 _Switch(["-ungapped", "ungapped"], 540 "Perform ungapped alignment only?"), 541 _Option(["-off_diagonal_range", "off_diagonal_range"], 542 """Number of off-diagonals to search for the 2nd hit (integer). 543 544 Expects a positive integer, or 0 (default) to turn off. 545 546 Added in BLAST 2.2.23+ 547 """, 548 equate=False), 549 ] 550 _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
551
552 - def _validate(self):
553 if (self.template_type and not self.template_length) \ 554 or (self.template_length and not self.template_type): 555 raise ValueError("Options template_type and template_type require each other.") 556 _NcbiblastMain2SeqCommandline._validate(self)
557 558
559 -class NcbiblastxCommandline(_NcbiblastMain2SeqCommandline):
560 """Wrapper for the NCBI BLAST+ program blastx (nucleotide query, protein database). 561 562 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 563 replaced the old blastall tool with separate tools for each of the searches. 564 This wrapper therefore replaces BlastallCommandline with option -p blastx. 565 566 >>> from Bio.Blast.Applications import NcbiblastxCommandline 567 >>> cline = NcbiblastxCommandline(query="m_cold.fasta", db="nr", evalue=0.001) 568 >>> cline 569 NcbiblastxCommandline(cmd='blastx', query='m_cold.fasta', db='nr', evalue=0.001) 570 >>> print(cline) 571 blastx -query m_cold.fasta -db nr -evalue 0.001 572 573 You would typically run the command line with cline() or via the Python 574 subprocess module, as described in the Biopython tutorial. 575 """
576 - def __init__(self, cmd="blastx", **kwargs):
577 self.parameters = [ 578 # Input query options: 579 _Option(["-task", "task"], 580 "Task to execute (string, blastx (default) or blastx-fast).", 581 checker_function=lambda value: value in ["blastx", 582 "blastx-fast"], 583 equate=False), 584 _Option(["-strand", "strand"], 585 """Query strand(s) to search against database/subject. 586 587 Values allowed are "both" (default), "minus", "plus".""", 588 checker_function=lambda value: value in ["both", "minus", "plus"], 589 equate=False), 590 # Input query options: 591 _Option(["-query_gencode", "query_gencode"], 592 "Genetic code to use to translate query (integer, default 1).", 593 equate=False), 594 # General search options: 595 _Option(["-frame_shift_penalty", "frame_shift_penalty"], 596 """Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE). 597 598 This was removed in BLAST 2.2.27+""", 599 equate=False), 600 _Option(["-max_intron_length", "max_intron_length"], 601 """Maximum intron length (integer). 602 603 Length of the largest intron allowed in a translated nucleotide 604 sequence when linking multiple distinct alignments (a negative 605 value disables linking). Default zero.""", 606 equate=False), 607 _Option(["-matrix", "matrix"], 608 "Scoring matrix name (default BLOSUM62).", 609 equate=False), 610 _Option(["-threshold", "threshold"], 611 "Minimum score for words to be added to the BLAST lookup table (float).", 612 equate=False), 613 _Option(["-comp_based_stats", "comp_based_stats"], 614 """Use composition-based statistics for blastp, blastx, or tblastn: 615 616 D or d: default (equivalent to 2 ) 617 0 or F or f: no composition-based statistics 618 1: Composition-based statistics as in NAR 29:2994-3005, 2001 619 2 or T or t : Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence properties 620 3: Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally 621 622 For programs other than tblastn, must either be absent or be D, F or 0 623 Default = `2' 624 """, 625 equate=False), 626 # Query filtering options: 627 _Option(["-seg", "seg"], 628 """Filter query sequence with SEG (string). 629 630 Format: "yes", "window locut hicut", or "no" to disable. 631 Default is "12 2.2 2.5""", 632 equate=False), 633 # Extension options: 634 _Switch(["-ungapped", "ungapped"], 635 "Perform ungapped alignment only?"), 636 _Switch(["-use_sw_tback", "use_sw_tback"], 637 "Compute locally optimal Smith-Waterman alignments?"), 638 ] 639 _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
640 641
642 -class NcbitblastnCommandline(_NcbiblastMain2SeqCommandline):
643 """Wrapper for the NCBI BLAST+ program tblastn. 644 645 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 646 replaced the old blastall tool with separate tools for each of the searches. 647 This wrapper therefore replaces BlastallCommandline with option -p tblastn. 648 649 >>> from Bio.Blast.Applications import NcbitblastnCommandline 650 >>> cline = NcbitblastnCommandline(help=True) 651 >>> cline 652 NcbitblastnCommandline(cmd='tblastn', help=True) 653 >>> print(cline) 654 tblastn -help 655 656 You would typically run the command line with cline() or via the Python 657 subprocess module, as described in the Biopython tutorial. 658 """
659 - def __init__(self, cmd="tblastn", **kwargs):
660 self.parameters = [ 661 # General search options: 662 _Option(["-task", "task"], 663 "Task to execute (string, tblastn (default) or tblastn-fast).", 664 checker_function=lambda value: value in ["tblastn", 665 "tblastn-fast"], 666 equate=False), 667 _Option(["-db_gencode", "db_gencode"], 668 "Genetic code to use to translate query (integer, default 1).", 669 equate=False), 670 _Option(["-frame_shift_penalty", "frame_shift_penalty"], 671 """Frame shift penalty (integer, at least 1, default ignored) (OBSOLETE). 672 673 This was removed in BLAST 2.2.27+""", 674 equate=False), 675 _Option(["-max_intron_length", "max_intron_length"], 676 """Maximum intron length (integer). 677 678 Length of the largest intron allowed in a translated nucleotide 679 sequence when linking multiple distinct alignments (a negative 680 value disables linking). Default zero.""", 681 equate=False), 682 _Option(["-matrix", "matrix"], 683 "Scoring matrix name (default BLOSUM62).", 684 equate=False), 685 _Option(["-threshold", "threshold"], 686 "Minimum score for words to be added to the BLAST lookup table (float).", 687 equate=False), 688 _Option(["-comp_based_stats", "comp_based_stats"], 689 """Use composition-based statistics (string, default 2, i.e. True). 690 691 0, F or f: no composition-based statistics 692 1: Composition-based statistics as in NAR 29:2994-3005, 2001 693 2, T or t, D or d : Composition-based score adjustment as in 694 Bioinformatics 21:902-911, 2005, conditioned on sequence properties 695 3: Composition-based score adjustment as in Bioinformatics 21:902-911, 696 2005, unconditionally 697 698 Note that only tblastn supports values of 1 and 3.""", 699 checker_function=lambda value: value in "0Ft12TtDd3", 700 equate=False), 701 # Query filtering options: 702 _Option(["-seg", "seg"], 703 """Filter query sequence with SEG (string). 704 705 Format: "yes", "window locut hicut", or "no" to disable. 706 Default is "12 2.2 2.5""", 707 equate=False), 708 # Extension options: 709 _Switch(["-ungapped", "ungapped"], 710 "Perform ungapped alignment only?"), 711 # Miscellaneous options: 712 _Switch(["-use_sw_tback", "use_sw_tback"], 713 "Compute locally optimal Smith-Waterman alignments?"), 714 # PSI-TBLASTN options: 715 _Option(["-in_pssm", "in_pssm"], 716 """PSI-BLAST checkpoint file 717 718 Incompatible with: remote, query""", 719 filename=True, 720 equate=False), 721 ] 722 _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
723 724
725 -class NcbitblastxCommandline(_NcbiblastMain2SeqCommandline):
726 """Wrapper for the NCBI BLAST+ program tblastx. 727 728 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 729 replaced the old blastall tool with separate tools for each of the searches. 730 This wrapper therefore replaces BlastallCommandline with option -p tblastx. 731 732 >>> from Bio.Blast.Applications import NcbitblastxCommandline 733 >>> cline = NcbitblastxCommandline(help=True) 734 >>> cline 735 NcbitblastxCommandline(cmd='tblastx', help=True) 736 >>> print(cline) 737 tblastx -help 738 739 You would typically run the command line with cline() or via the Python 740 subprocess module, as described in the Biopython tutorial. 741 """
742 - def __init__(self, cmd="tblastx", **kwargs):
743 self.parameters = [ 744 # Input query options: 745 _Option(["-strand", "strand"], 746 """Query strand(s) to search against database/subject. 747 748 Values allowed are "both" (default), "minus", "plus".""", 749 checker_function=lambda value: value in ["both", "minus", "plus"], 750 equate=False), 751 # Input query options: 752 _Option(["-query_gencode", "query_gencode"], 753 "Genetic code to use to translate query (integer, default 1).", 754 equate=False), 755 # General search options: 756 _Option(["-db_gencode", "db_gencode"], 757 "Genetic code to use to translate query (integer, default 1).", 758 equate=False), 759 _Option(["-max_intron_length", "max_intron_length"], 760 """Maximum intron length (integer). 761 762 Length of the largest intron allowed in a translated nucleotide 763 sequence when linking multiple distinct alignments (a negative 764 value disables linking). Default zero.""", 765 equate=False), 766 _Option(["-matrix", "matrix"], 767 "Scoring matrix name (default BLOSUM62).", 768 equate=False), 769 _Option(["-threshold", "threshold"], 770 "Minimum score for words to be added to the BLAST lookup table (float).", 771 equate=False), 772 # Query filtering options: 773 _Option(["-seg", "seg"], 774 """Filter query sequence with SEG (string). 775 776 Format: "yes", "window locut hicut", or "no" to disable. 777 Default is "12 2.2 2.5""", 778 equate=False), 779 ] 780 _NcbiblastMain2SeqCommandline.__init__(self, cmd, **kwargs)
781 782
783 -class NcbipsiblastCommandline(_Ncbiblast2SeqCommandline):
784 """Wrapper for the NCBI BLAST+ program psiblast. 785 786 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 787 replaced the old blastpgp tool with a similar tool psiblast. This wrapper 788 therefore replaces BlastpgpCommandline, the wrapper for blastpgp. 789 790 >>> from Bio.Blast.Applications import NcbipsiblastCommandline 791 >>> cline = NcbipsiblastCommandline(help=True) 792 >>> cline 793 NcbipsiblastCommandline(cmd='psiblast', help=True) 794 >>> print(cline) 795 psiblast -help 796 797 You would typically run the command line with cline() or via the Python 798 subprocess module, as described in the Biopython tutorial. 799 """
800 - def __init__(self, cmd="psiblast", **kwargs):
801 self.parameters = [ 802 # General search options: 803 _Option(["-matrix", "matrix"], 804 "Scoring matrix name (default BLOSUM62).", 805 equate=False), 806 _Option(["-threshold", "threshold"], 807 "Minimum score for words to be added to the BLAST lookup table (float).", 808 equate=False), 809 _Option(["-comp_based_stats", "comp_based_stats"], 810 """Use composition-based statistics (string, default 2, i.e. True). 811 812 0, F or f: no composition-based statistics 813 2, T or t, D or d : Composition-based score adjustment 814 as in Bioinformatics 21:902-911, 2005, conditioned on 815 sequence properties 816 817 Note that tblastn also supports values of 1 and 3.""", 818 checker_function=lambda value: value in "0Ft2TtDd", 819 equate=False), 820 # Query filtering options: 821 _Option(["-seg", "seg"], 822 """Filter query sequence with SEG (string). 823 824 Format: "yes", "window locut hicut", or "no" to disable. 825 Default is "12 2.2 2.5""", 826 equate=False), 827 # Extension options: 828 _Option(["-gap_trigger", "gap_trigger"], 829 "Number of bits to trigger gapping (float, default 22).", 830 equate=False), 831 # Miscellaneous options: 832 _Switch(["-use_sw_tback", "use_sw_tback"], 833 "Compute locally optimal Smith-Waterman alignments?"), 834 # PSI-BLAST options: 835 _Option(["-num_iterations", "num_iterations"], 836 """Number of iterations to perform (integer, at least one). 837 838 Default is one. 839 Incompatible with: remote""", 840 equate=False), 841 _Option(["-out_pssm", "out_pssm"], 842 "File name to store checkpoint file.", 843 filename=True, 844 equate=False), 845 _Option(["-out_ascii_pssm", "out_ascii_pssm"], 846 "File name to store ASCII version of PSSM.", 847 filename=True, 848 equate=False), 849 _Option(["-in_msa", "in_msa"], 850 """File name of multiple sequence alignment to restart PSI-BLAST. 851 852 Incompatible with: in_pssm, query""", 853 filename=True, 854 equate=False), 855 _Option(["-msa_master_idx", "msa_master_idx"], 856 """Index of sequence to use as master in MSA. 857 858 Index (1-based) of sequence to use as the master in the 859 multiple sequence alignment. If not specified, the first 860 sequence is used.""", 861 equate=False), 862 _Option(["-in_pssm", "in_pssm"], 863 """PSI-BLAST checkpoint file. 864 865 Incompatible with: in_msa, query, phi_pattern""", 866 filename=True, 867 equate=False), 868 # PSSM engine options: 869 _Option(["-pseudocount", "pseudocount"], 870 """Pseudo-count value used when constructing PSSM. 871 872 Integer. Default is zero.""", 873 equate=False), 874 _Option(["-inclusion_ethresh", "inclusion_ethresh"], 875 "E-value inclusion threshold for pairwise alignments (float, default 0.002).", 876 equate=False), 877 _Switch(["-ignore_msa_master", "ignore_msa_master"], 878 """Ignore the master sequence when creating PSSM 879 880 Requires: in_msa 881 Incompatible with: msa_master_idx, in_pssm, query, query_loc, phi_pattern 882 """), 883 # PHI-BLAST options: 884 _Option(["-phi_pattern", "phi_pattern"], 885 """File name containing pattern to search. 886 887 Incompatible with: in_pssm""", 888 filename=True, 889 equate=False), 890 ] 891 _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
892
893 - def _validate(self):
894 incompatibles = {"num_iterations": ["remote"], 895 "in_msa": ["in_pssm", "query"], 896 "in_pssm": ["in_msa", "query", "phi_pattern"], 897 "ignore_msa_master": ["msa_master_idx", "in_pssm", 898 "query", "query_loc", "phi_pattern"], 899 } 900 self._validate_incompatibilities(incompatibles) 901 _Ncbiblast2SeqCommandline._validate(self)
902 903
904 -class NcbirpsblastCommandline(_NcbiblastCommandline):
905 """Wrapper for the NCBI BLAST+ program rpsblast. 906 907 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 908 replaced the old rpsblast tool with a similar tool of the same name. This 909 wrapper replaces RpsBlastCommandline, the wrapper for the old rpsblast. 910 911 >>> from Bio.Blast.Applications import NcbirpsblastCommandline 912 >>> cline = NcbirpsblastCommandline(help=True) 913 >>> cline 914 NcbirpsblastCommandline(cmd='rpsblast', help=True) 915 >>> print(cline) 916 rpsblast -help 917 918 You would typically run the command line with cline() or via the Python 919 subprocess module, as described in the Biopython tutorial. 920 """
921 - def __init__(self, cmd="rpsblast", **kwargs):
922 # TODO - remove the -word_size argument as per BLAST+ 2.2.30 923 # (BLAST team say it should never have been included, since 924 # the word size is set when building the domain database.) 925 # This likely means reviewing the class hierarchy again. 926 self.parameters = [ 927 # Query filtering options: 928 _Option(["-seg", "seg"], 929 """Filter query sequence with SEG (string). 930 931 Format: "yes", "window locut hicut", or "no" to disable. 932 Default is "12 2.2 2.5""", 933 equate=False), 934 # Restrict search or results: 935 _Option(["-culling_limit", "culling_limit"], 936 """Hit culling limit (integer). 937 938 If the query range of a hit is enveloped by that of at 939 least this many higher-scoring hits, delete the hit. 940 941 Incompatible with: best_hit_overhang, best_hit_score_edge. 942 """, 943 equate=False), 944 _Option(["-best_hit_overhang", "best_hit_overhang"], 945 """Best Hit algorithm overhang value (recommended value: 0.1) 946 947 Float between 0.0 and 0.5 inclusive. 948 949 Incompatible with: culling_limit.""", 950 equate=False), 951 _Option(["-best_hit_score_edge", "best_hit_score_edge"], 952 """Best Hit algorithm score edge value (recommended value: 0.1) 953 954 Float between 0.0 and 0.5 inclusive. 955 956 Incompatible with: culling_limit.""", 957 equate=False), 958 # General search options: 959 _Option(["-comp_based_stats", "comp_based_stats"], 960 """Use composition-based statistics. 961 962 D or d: default (equivalent to 0 ) 963 0 or F or f: Simplified Composition-based statistics as in 964 Bioinformatics 15:1000-1011, 1999 965 1 or T or t: Composition-based statistics as in NAR 29:2994-3005, 2001 966 967 Default = `0' 968 """, 969 checker_function=lambda value: value in "Dd0Ff1Tt", 970 equate=False), 971 # Misc options: 972 _Switch(["-use_sw_tback", "use_sw_tback"], 973 "Compute locally optimal Smith-Waterman alignments?"), 974 ] 975 _NcbiblastCommandline.__init__(self, cmd, **kwargs)
976
977 - def _validate(self):
978 incompatibles = {"culling_limit": ["best_hit_overhang", "best_hit_score_edge"]} 979 self._validate_incompatibilities(incompatibles) 980 _NcbiblastCommandline._validate(self)
981 982
983 -class NcbirpstblastnCommandline(_NcbiblastCommandline):
984 """Wrapper for the NCBI BLAST+ program rpstblastn. 985 986 With the release of BLAST+ (BLAST rewritten in C++ instead of C), the NCBI 987 replaced the old rpsblast tool with a similar tool of the same name, and a 988 separate tool rpstblastn for Translated Reverse Position Specific BLAST. 989 990 >>> from Bio.Blast.Applications import NcbirpstblastnCommandline 991 >>> cline = NcbirpstblastnCommandline(help=True) 992 >>> cline 993 NcbirpstblastnCommandline(cmd='rpstblastn', help=True) 994 >>> print(cline) 995 rpstblastn -help 996 997 You would typically run the command line with cline() or via the Python 998 subprocess module, as described in the Biopython tutorial. 999 """
1000 - def __init__(self, cmd="rpstblastn", **kwargs):
1001 # TODO - remove the -word_size argument as per BLAST+ 2.2.30 1002 # (BLAST team say it should never have been included, since 1003 # the word size is set when building the domain database.) 1004 # This likely means reviewing the class hierarchy again. 1005 self.parameters = [ 1006 # Input query options: 1007 _Option(["-strand", "strand"], 1008 """Query strand(s) to search against database/subject. 1009 1010 Values allowed are "both" (default), "minus", "plus".""", 1011 checker_function=lambda value: value in ["both", 1012 "minus", 1013 "plus"], 1014 equate=False), 1015 # Input query options: 1016 _Option(["-query_gencode", "query_gencode"], 1017 "Genetic code to use to translate query (integer, default 1).", 1018 equate=False), 1019 # Query filtering options: 1020 _Option(["-seg", "seg"], 1021 """Filter query sequence with SEG (string). 1022 1023 Format: "yes", "window locut hicut", or "no" to disable. 1024 Default is "12 2.2 2.5""", 1025 equate=False), 1026 # Extension options: 1027 _Switch(["-ungapped", "ungapped"], 1028 "Perform ungapped alignment only?"), 1029 ] 1030 _NcbiblastCommandline.__init__(self, cmd, **kwargs)
1031 1032
1033 -class NcbiblastformatterCommandline(_NcbibaseblastCommandline):
1034 """Wrapper for the NCBI BLAST+ program blast_formatter. 1035 1036 With the release of BLAST 2.2.24+ (i.e. the BLAST suite rewritten in C++ 1037 instead of C), the NCBI added the ASN.1 output format option to all the 1038 search tools, and extended the blast_formatter to support this as input. 1039 1040 The blast_formatter command allows you to convert the ASN.1 output into 1041 the other output formats (XML, tabular, plain text, HTML). 1042 1043 >>> from Bio.Blast.Applications import NcbiblastformatterCommandline 1044 >>> cline = NcbiblastformatterCommandline(archive="example.asn", outfmt=5, out="example.xml") 1045 >>> cline 1046 NcbiblastformatterCommandline(cmd='blast_formatter', out='example.xml', outfmt=5, archive='example.asn') 1047 >>> print(cline) 1048 blast_formatter -out example.xml -outfmt 5 -archive example.asn 1049 1050 You would typically run the command line with cline() or via the Python 1051 subprocess module, as described in the Biopython tutorial. 1052 1053 Note that this wrapper is for the version of blast_formatter from BLAST 1054 2.2.24+ (or later) which is when the NCBI first announced the inclusion 1055 this tool. There was actually an early version in BLAST 2.2.23+ (and 1056 possibly in older releases) but this did not have the -archive option 1057 (instead -rid is a mandatory argument), and is not supported by this 1058 wrapper. 1059 """
1060 - def __init__(self, cmd="blast_formatter", **kwargs):
1061 self.parameters = [ 1062 # Input options 1063 _Option(["-rid", "rid"], 1064 "BLAST Request ID (RID), not compatible with archive arg", 1065 equate=False), 1066 _Option(["-archive", "archive"], 1067 "Archive file of results, not compatible with rid arg.", 1068 filename=True, 1069 equate=False), 1070 # Restrict search or results 1071 _Option(["-max_target_seqs", "max_target_seqs"], 1072 "Maximum number of aligned sequences to keep", 1073 checker_function=lambda value: value >= 1, 1074 equate=False), 1075 ] 1076 _NcbibaseblastCommandline.__init__(self, cmd, **kwargs)
1077
1078 - def _validate(self):
1079 incompatibles = {"rid": ["archive"]} 1080 self._validate_incompatibilities(incompatibles) 1081 _NcbibaseblastCommandline._validate(self)
1082 1083
1084 -class NcbideltablastCommandline(_Ncbiblast2SeqCommandline):
1085 """Create a commandline for the NCBI BLAST+ program deltablast (for proteins). 1086 1087 This is a wrapper for the deltablast command line command included in 1088 the NCBI BLAST+ software (not present in the original BLAST). 1089 1090 >>> from Bio.Blast.Applications import NcbideltablastCommandline 1091 >>> cline = NcbideltablastCommandline(query="rosemary.pro", db="nr", 1092 ... evalue=0.001, remote=True) 1093 >>> cline 1094 NcbideltablastCommandline(cmd='deltablast', query='rosemary.pro', db='nr', evalue=0.001, remote=True) 1095 >>> print(cline) 1096 deltablast -query rosemary.pro -db nr -evalue 0.001 -remote 1097 1098 You would typically run the command line with cline() or via the Python 1099 subprocess module, as described in the Biopython tutorial. 1100 """
1101 - def __init__(self, cmd="deltablast", **kwargs):
1102 self.parameters = [ 1103 # General search options: 1104 _Option(["-matrix", "matrix"], 1105 "Scoring matrix name (default BLOSUM62)."), 1106 _Option(["-threshold", "threshold"], 1107 "Minimum score for words to be added to the BLAST lookup table (float).", 1108 equate=False), 1109 _Option(["-comp_based_stats", "comp_based_stats"], 1110 """Use composition-based statistics (string, default 2, i.e. True). 1111 1112 0, F or f: no composition-based statistics. 1113 2, T or t, D or d : Composition-based score adjustment as in 1114 Bioinformatics 21:902-911, 2005, conditioned on sequence properties 1115 1116 Note that tblastn also supports values of 1 and 3.""", 1117 checker_function=lambda value: value in "0Ft2TtDd", 1118 equate=False), 1119 # Query filtering options: 1120 _Option(["-seg", "seg"], 1121 """Filter query sequence with SEG (string). 1122 1123 Format: "yes", "window locut hicut", or "no" to disable. 1124 Default is "12 2.2 2.5""", 1125 equate=False), 1126 # Extension options: 1127 _Option(["-gap_trigger", "gap_trigger"], 1128 "Number of bits to trigger gapping Default = 22", 1129 equate=False), 1130 # Miscellaneous options: 1131 _Switch(["-use_sw_tback", "use_sw_tback"], 1132 "Compute locally optimal Smith-Waterman alignments?"), 1133 # PSI-BLAST options 1134 _Option(["-num_iterations", "num_iterations"], 1135 """Number of iterations to perform. (integer >=1, Default is 1) 1136 1137 Incompatible with: remote""", 1138 equate=False), 1139 _Option(["-out_pssm", "out_pssm"], 1140 "File name to store checkpoint file.", 1141 filename=True, 1142 equate=False), 1143 _Option(["-out_ascii_pssm", "out_ascii_pssm"], 1144 "File name to store ASCII version of PSSM.", 1145 filename=True, 1146 equate=False), 1147 # PSSM engine options 1148 _Option(["-pseudocount", "pseudocount"], 1149 "Pseudo-count value used when constructing PSSM (integer, default 0).", 1150 equate=False), 1151 _Option(["-domain_inclusion_ethresh", "domain_inclusion_ethresh"], 1152 """E-value inclusion threshold for alignments with conserved domains. 1153 1154 (float, Default is 0.05)""", 1155 equate=False), 1156 _Option(["-inclusion_ethresh", "inclusion_ethresh"], 1157 "E-value inclusion threshold for pairwise alignments (float, Default is 0.002).", 1158 equate=False), 1159 # DELTA-BLAST options 1160 _Option(["-rpsdb", "rpsdb"], 1161 "BLAST domain database name (dtring, Default = 'cdd_delta').", 1162 equate=False), 1163 _Switch(["-show_domain_hits", "show_domain_hits"], 1164 """Show domain hits? 1165 1166 Incompatible with: remote, subject""") 1167 ] 1168 _Ncbiblast2SeqCommandline.__init__(self, cmd, **kwargs)
1169 1170
1171 -def _test():
1172 """Run the Bio.Blast.Applications module's doctests.""" 1173 import doctest 1174 doctest.testmod(verbose=1)
1175 1176 if __name__ == "__main__": 1177 # Run the doctests 1178 _test() 1179