Package Bio :: Package Phylo :: Package Applications :: Module _Fasttree
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.Applications._Fasttree

  1  # Copyright 2013 by Nate Sutton.  Based on code in _Phyml.py by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its license. 
  3  # Please see the LICENSE file that should have been included as part of this 
  4  # package. 
  5  """Command-line wrapper for tree inference program Fasttree.""" 
  6  from __future__ import print_function 
  7   
  8   
  9  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 10   
 11   
12 -def _is_int(x):
13 """Test whether the argument can be serialized as an integer.""" 14 return isinstance(x, int) or str(x).isdigit()
15 16
17 -def _is_numeric(x):
18 """Test whether the argument can be serialized as a number.""" 19 try: 20 float(str(x)) 21 return True 22 except ValueError: 23 return False
24 25
class FastTreeCommandline(AbstractCommandline):
    r"""Command-line wrapper for FastTree.

    Homepage: http://www.microbesonline.org/fasttree/

    Citations:

    Price, M.N., Dehal, P.S., and Arkin, A.P. (2010) FastTree 2 -- Approximately
    Maximum-Likelihood Trees for Large Alignments. PLoS ONE, 5(3):e9490.
    doi:10.1371/journal.pone.0009490.

    Example usage:

    >>> import _Fasttree
    >>> fasttree_exe = r"C:\FasttreeWin32\fasttree.exe"
    >>> cmd = _Fasttree.FastTreeCommandline(fasttree_exe,
    ...                                     input=r'C:\Input\ExampleAlignment.fsa',
    ...                                     out=r'C:\Output\ExampleTree.tree')
    >>> print(cmd)
    >>> out, err = cmd()
    >>> print(out)
    >>> print(err)

    Usage advice:
    the only parameters needed are
    (fasttree_exe, input='<InputFile>', out='<OutputFile>')

    parameters that use values are added this way:
    (fasttree_exe, parameter=value, input='<InputFile>', out='<OutputFile>')
    parameters that don't use values are added this way:
    (fasttree_exe, parameter=True, input='<InputFile>', out='<OutputFile>')

    from the command line use 'fasttree.exe -help' or 'fasttree.exe -expert'
    for more explanation of usage options
    """

    # NOTE: the class docstring above is a raw string on purpose.  It embeds
    # Windows paths, and in a normal string "\f" (as in "\fasttree.exe") is a
    # form feed while "\F", "\I", "\O" and "\E" are invalid escape sequences.

    def __init__(self, cmd='fasttree', **kwargs):
        """Declare the FastTree options and initialise the base class.

        ``cmd`` is the FastTree executable (default ``'fasttree'``).  Any
        keyword arguments are forwarded unchanged to
        ``AbstractCommandline.__init__`` -- presumably they are matched
        against the option names declared in ``self.parameters``; confirm
        against ``Bio.Application``.

        Each entry pairs the FastTree flag with the Python keyword name.
        Options carrying ``checker_function`` are validated with ``_is_int``
        or ``_is_numeric``; options carrying ``filename=True`` take a path.
        """
        # NOTE(review): '-out' and the positional 'input' argument are kept
        # last in the list -- presumably list order is command-line order;
        # verify against AbstractCommandline before reordering.
        self.parameters = [
            _Switch(['-nt', 'nt'],
                    "By default FastTree expects protein alignments, use -nt for nucleotides",
                    ),
            _Option(['-n', 'n'],
                    """-n -- read N multiple alignments in.

                    This only works with phylip interleaved format. For example, you can
                    use it with the output from phylip's seqboot. If you use -n, FastTree
                    will write 1 tree per line to standard output.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-quote', 'quote'],
                    """-quote -- add quotes to sequence names in output.

                    Quote sequence names in the output and allow spaces, commas,
                    parentheses, and colons in them but not ' characters (fasta files only).
                    """,
                    ),
            _Option(['-pseudo', 'pseudo'],
                    """-pseudo [weight] -- Pseudocounts are used with sequence distance estimation.

                    Use pseudocounts to estimate distances between sequences with little or no
                    overlap. (Off by default.) Recommended if analyzing the alignment has
                    sequences with little or no overlap.
                    If the weight is not specified, it is 1.0
                    """,
                    checker_function=_is_numeric,
                    equate=False,
                    ),
            _Option(['-boot', 'boot'],
                    """Specify the number of resamples for support values.

                    Support value options:
                    By default, FastTree computes local support values by resampling the site
                    likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                    it will compute minimum-evolution bootstrap supports instead
                    In either case, the support values are proportions ranging from 0 to 1

                    Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-nosupport', 'nosupport'],
                    """Turn off support values.

                    Support value options:
                    By default, FastTree computes local support values by resampling the site
                    likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                    it will compute minimum-evolution bootstrap supports instead
                    In either case, the support values are proportions ranging from 0 to 1

                    Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.
                    """,
                    ),
            _Option(['-intree', 'intree'],
                    """-intree newickfile -- read the starting tree in from newickfile.

                    Any branch lengths in the starting trees are ignored.
                    -intree with -n will read a separate starting tree for each alignment.
                    """,
                    filename=True,
                    equate=False,
                    ),
            _Option(['-intree1', 'intree1'],
                    "intree1 newickfile -- read the same starting tree for each alignment.",
                    filename=True,
                    equate=False,
                    ),
            _Switch(['-quiet', 'quiet'],
                    """-quiet -- do not write to standard error during normal operation

                    (no progress indicator, no options summary, no likelihood values, etc.)
                    """,
                    ),
            _Switch(['-nopr', 'nopr'],
                    "-nopr -- do not write the progress indicator to stderr.",
                    ),
            _Option(['-nni', 'nni'],
                    """Set the rounds of minimum-evolution nearest-neighbor interchanges

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Option(['-spr', 'spr'],
                    """Set the rounds of subtree-prune-regraft moves

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-noml', 'noml'],
                    """Deactivate min-evo NNIs and SPRs.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -noml to turn off both min-evo NNIs and SPRs (useful if refining
                    an approximately maximum-likelihood tree with further NNIs).
                    """,
                    ),
            _Switch(['-mllen', 'mllen'],
                    """Optimize branch lengths on a fixed topology.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -mllen to optimize branch lengths without ML NNIs
                    Use -mllen -nome with -intree to optimize branch lengths on a fixed topology.
                    """,
                    ),
            _Switch(['-nome', 'nome'],
                    """Changes support values calculation to a minimum-evolution bootstrap method.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -mllen to optimize branch lengths without ML NNIs
                    Use -mllen -nome with -intree to optimize branch lengths on a fixed topology

                    Support value options:
                    By default, FastTree computes local support values by resampling the site
                    likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                    it will compute minimum-evolution bootstrap supports instead
                    In either case, the support values are proportions ranging from 0 to 1.
                    """,
                    ),
            _Option(['-mlnni', 'mlnni'],
                    """Set the number of rounds of maximum-likelihood NNIs.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -mlnni to set the number of rounds of maximum-likelihood NNIs.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Option(['-mlacc', 'mlacc'],
                    """Option for optimization of branches at each NNI.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,
                    and to optimize all 5 branches in 2 or 3 rounds.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-slownni', 'slownni'],
                    """Turn off heuristics to avoid constant subtrees with NNIs.

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    Use -slownni to turn off heuristics to avoid constant subtrees
                    (affects both ML and ME NNIs).
                    """,
                    ),
            _Switch(['-wag', 'wag'],
                    """Maximum likelihood model options.

                    Whelan-And-Goldman 2001 model instead of (default)
                    Jones-Taylor-Thorton 1992 model (a.a. only)
                    """,
                    ),
            _Switch(['-gtr', 'gtr'],
                    """Maximum likelihood model options.

                    Use generalized time-reversible instead of (default)
                    Jukes-Cantor (nt only)
                    """,
                    ),
            _Option(['-cat', 'cat'],
                    """Maximum likelihood model options.

                    Specify the number of rate categories of sites (default 20).""",
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-nocat', 'nocat'],
                    "Maximum likelihood model options: No CAT model (just 1 category)",
                    ),
            _Switch(['-gamma', 'gamma'],
                    """Report the likelihood under the discrete gamma model.

                    Maximum likelihood model options:
                    -gamma -- after the final round of optimizing branch lengths with the CAT model,
                    report the likelihood under the discrete gamma model with the same
                    number of categories. FastTree uses the same branch lengths but
                    optimizes the gamma shape parameter and the scale of the lengths.
                    The final tree will have rescaled lengths. Used with -log, this
                    also generates per-site likelihoods for use with CONSEL, see
                    GammaLogToPaup.pl and documentation on the FastTree web site.
                    """,
                    ),
            _Switch(['-slow', 'slow'],
                    """Use an exhaustive search.

                    Searching for the best join:
                    By default, FastTree combines the 'visible set' of fast neighbor-joining with
                    local hill-climbing as in relaxed neighbor-joining
                    -slow -- exhaustive search (like NJ or BIONJ, but different gap handling)
                    -slow takes half an hour instead of 8 seconds for 1,250 proteins
                    """,
                    ),
            _Switch(['-fastest', 'fastest'],
                    """Search the visible set (the top hit for each node) only.

                    Searching for the best join:
                    By default, FastTree combines the 'visible set' of fast neighbor-joining with
                    local hill-climbing as in relaxed neighbor-joining
                    -fastest -- search the visible set (the top hit for each node) only
                    Unlike the original fast neighbor-joining, -fastest updates visible(C)
                    after joining A and B if join(AB,C) is better than join(C,visible(C))
                    -fastest also updates out-distances in a very lazy way,
                    -fastest sets -2nd on as well, use -fastest -no2nd to avoid this
                    """,
                    ),
            # The Python keyword is 'second' because '2nd' is not a valid
            # identifier.
            _Switch(['-2nd', 'second'],
                    """Turn 2nd-level top hits heuristic on.

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    Use -notop (or -slow) to turn this feature off
                    and compare all leaves to each other,
                    and all new joined nodes to each other

                    -2nd or -no2nd to turn 2nd-level top hits heuristic on or off
                    This reduces memory usage and running time but may lead to
                    marginal reductions in tree quality.
                    (By default, -fastest turns on -2nd.)
                    """,
                    ),
            _Switch(['-no2nd', 'no2nd'],
                    """Turn 2nd-level top hits heuristic off.

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    Use -notop (or -slow) to turn this feature off
                    and compare all leaves to each other,
                    and all new joined nodes to each other

                    -2nd or -no2nd to turn 2nd-level top hits heuristic on or off
                    This reduces memory usage and running time but may lead to
                    marginal reductions in tree quality.
                    (By default, -fastest turns on -2nd.)
                    """,
                    ),
            _Option(['-seed', 'seed'],
                    """Use -seed to initialize the random number generator.

                    Support value options:
                    By default, FastTree computes local support values by resampling the site
                    likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                    it will compute minimum-evolution bootstrap supports instead
                    In either case, the support values are proportions ranging from 0 to 1.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-top', 'top'],
                    """Top-hit list to speed up search

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    Use -notop (or -slow) to turn this feature off
                    and compare all leaves to each other,
                    and all new joined nodes to each other.
                    """,
                    ),
            _Switch(['-notop', 'notop'],
                    """Turn off top-hit list to speed up search

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    Use -notop (or -slow) to turn this feature off
                    and compare all leaves to each other,
                    and all new joined nodes to each other.
                    """,
                    ),
            _Option(['-topm', 'topm'],
                    """Change the top hits calculation method

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    -topm 1.0 -- set the top-hit list size to parameter*sqrt(N)
                    FastTree estimates the top m hits of a leaf from the
                    top 2*m hits of a 'close' neighbor, where close is
                    defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),
                    and updates the top-hits as joins proceed.
                    """,
                    checker_function=_is_numeric,
                    equate=False,
                    ),
            _Option(['-close', 'close'],
                    """Modify the close heuristic for the top-hit list

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    -close 0.75 -- modify the close heuristic, lower is more conservative.
                    """,
                    checker_function=_is_numeric,
                    equate=False,
                    ),
            _Option(['-refresh', 'refresh'],
                    """Parameter for conditions that joined nodes are compared to other nodes

                    Top-hit heuristics:
                    By default, FastTree uses a top-hit list to speed up search
                    -refresh 0.8 -- compare a joined node to all other nodes if its
                    top-hit list is less than 80% of the desired length,
                    or if the age of the top-hit list is log2(m) or greater.
                    """,
                    checker_function=_is_numeric,
                    equate=False,
                    ),
            _Option(['-matrix', 'matrix'],
                    """Specify a matrix for nucleotide or amino acid distances

                    Distances:
                    Default: For protein sequences, log-corrected distances and an
                    amino acid dissimilarity matrix derived from BLOSUM45
                    or for nucleotide sequences, Jukes-Cantor distances
                    To specify a different matrix, use -matrix FilePrefix or -nomatrix
                    """,
                    filename=True,
                    equate=False,
                    ),
            _Switch(['-nomatrix', 'nomatrix'],
                    """Specify that no matrix should be used for nucleotide or amino acid distances

                    Distances:
                    Default: For protein sequences, log-corrected distances and an
                    amino acid dissimilarity matrix derived from BLOSUM45
                    or for nucleotide sequences, Jukes-Cantor distances
                    To specify a different matrix, use -matrix FilePrefix or -nomatrix
                    """,
                    ),
            _Switch(['-nj', 'nj'],
                    "Join options: regular (unweighted) neighbor-joining (default)",
                    ),
            _Switch(['-bionj', 'bionj'],
                    """Join options: weighted joins as in BIONJ.

                    FastTree will also weight joins during NNIs.
                    """,
                    ),
            # No checker_function on the next two options.
            _Option(['-gtrrates', 'gtrrates'],
                    "-gtrrates ac ag at cg ct gt",
                    equate=False,
                    ),
            _Option(['-gtrfreq', 'gtrfreq'],
                    "-gtrfreq A C G T",
                    equate=False,
                    ),
            _Option(['-constraints', 'constraints'],
                    """Specifies an alignment file for use with constrained topology searching

                    Constrained topology search options:
                    -constraints alignmentfile -- an alignment with values of 0, 1, and -
                    Not all sequences need be present. A column of 0s and 1s defines a
                    constrained split. Some constraints may be violated
                    (see 'violating constraints:' in standard error).
                    """,
                    filename=True,
                    equate=False,
                    ),
            _Option(['-constraintWeight', 'constraintWeight'],
                    """Weight strength of contraints in topology searching.

                    Constrained topology search options:
                    -constraintWeight -- how strongly to weight the constraints. A value of 1
                    means a penalty of 1 in tree length for violating a constraint
                    Default: 100.0
                    """,
                    checker_function=_is_numeric,
                    equate=False,
                    ),
            _Option(['-log', 'log'],
                    """Create log files of data such as intermediate trees and per-site rates

                    -log logfile -- save intermediate trees so you can extract
                    the trees and restart long-running jobs if they crash
                    -log also reports the per-site rates (1 means slowest category).
                    """,
                    filename=True,
                    equate=False,
                    ),
            _Option(['-makematrix', 'makematrix'],
                    "-makematrix [alignment]",
                    filename=True,
                    equate=False,
                    ),
            _Switch(['-rawdist', 'rawdist'],
                    """Turn off or adjust log-correction in AA or NT distances.

                    Use -rawdist to turn the log-correction off or to use
                    %different instead of Jukes-Cantor in AA or NT distances

                    Distances:
                    Default: For protein sequences, log-corrected distances and an
                    amino acid dissimilarity matrix derived from BLOSUM45
                    or for nucleotide sequences, Jukes-Cantor distances
                    To specify a different matrix, use -matrix FilePrefix or -nomatrix
                    """,
                    ),
            _Option(['-sprlength', 'sprlength'],
                    """Set maximum SPR move length in topology refinement (default 10).

                    Topology refinement:
                    By default, FastTree tries to improve the tree with up to 4*log2(N)
                    rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                    where N is the number of unique sequences, 2 rounds of
                    subtree-prune-regraft (SPR) moves (also min. evo.), and
                    up to 2*log(N) rounds of maximum-likelihood NNIs.
                    Use -nni to set the number of rounds of min. evo. NNIs,
                    and -spr to set the rounds of SPRs.
                    """,
                    checker_function=_is_int,
                    equate=False,
                    ),
            _Switch(['-help', 'help'], "Show the help."),
            _Switch(['-expert', 'expert'], "Show the expert level help."),
            _Option(['-out', 'out'],
                    """Enter <output file>

                    The path to a Newick Tree output file needs to be specified.
                    """,
                    filename=True,
                    equate=False,
                    ),
            # The alignment file is the only entry declared as required.
            _Argument(['input'],
                      """Enter <input file>

                      An input file of sequence alignments in fasta or phylip format
                      is needed. By default FastTree expects protein
                      alignments, use -nt for nucleotides.
                      """,
                      filename=True,
                      is_required=True,
                      ),
        ]

        # self.parameters is assigned before the base initialiser runs.
        # Explicit base-class call kept: the module still targets Python 2
        # (see the __future__ import at the top of the file).
        AbstractCommandline.__init__(self, cmd, **kwargs)
552