Package Bio :: Package Phylo :: Package Applications :: Module _Fasttree
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.Applications._Fasttree

  1  # Copyright 2013 by Nate Sutton.  Based on code in _Phyml.py by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its license. 
  3  # Please see the LICENSE file that should have been included as part of this 
  4  # package. 
  5  """Command-line wrapper for tree inference program Fasttree.""" 
  6  from __future__ import print_function 
  7   
  8   
  9  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 10   
 11   
12 -def _is_int(x):
13 """Test whether the argument can be serialized as an integer.""" 14 return isinstance(x, int) or str(x).isdigit()
15 16
17 -def _is_numeric(x):
18 """Test whether the argument can be serialized as a number.""" 19 try: 20 float(str(x)) 21 return True 22 except ValueError: 23 return False
24 25
class FastTreeCommandline(AbstractCommandline):
    r"""Command-line wrapper for FastTree.

    Homepage: http://www.microbesonline.org/fasttree/

    Citations:

    Price, M.N., Dehal, P.S., and Arkin, A.P. (2010) FastTree 2 -- Approximately
    Maximum-Likelihood Trees for Large Alignments. PLoS ONE, 5(3):e9490.
    doi:10.1371/journal.pone.0009490.

    Example usage:

    >>> import _Fasttree
    >>> fasttree_exe = r"C:\FasttreeWin32\fasttree.exe"
    >>> cmd = _Fasttree.FastTreeCommandline(fasttree_exe,
    ...                                     input=r'C:\Input\ExampleAlignment.fsa',
    ...                                     out=r'C:\Output\ExampleTree.tree')
    >>> print(cmd)
    >>> out, err = cmd()
    >>> print(out)
    >>> print(err)

    Usage advice:
    the only parameters needed are (fasttree_exe, input='<InputFile>', out='<OutputFile>')

    parameters that use values are added this way: (fasttree_exe, parameter=value, input='<InputFile>', out='<OutputFile>')
    parameters that don't use values are added this way: (fasttree_exe, parameter=True, input='<InputFile>', out='<OutputFile>')

    from the command line use 'fasttree.exe -help' or 'fasttree.exe -expert' for more explanation of usage options
    """

    # NOTE: the docstring above is a raw string on purpose. It contains
    # Windows paths, and in a normal string "\f" would become a form feed
    # while "\F", "\I", "\O" and "\E" are invalid escape sequences.

    def __init__(self, cmd='fasttree', **kwargs):
        """Initialize the class.

        ``cmd`` is the executable name or path (default ``'fasttree'``).
        The remaining keyword arguments are passed unchanged to
        ``AbstractCommandline.__init__`` after ``self.parameters`` is built.
        """
        # Each entry pairs a FastTree flag with the keyword name used on this
        # wrapper, followed by its help text. Integer-valued options are
        # validated with _is_int and real-valued ones with _is_numeric.
        # NOTE(review): equate=False presumably makes the value a separate
        # argument ("-boot 100") instead of "-boot=100" -- confirm against
        # Bio.Application.
        self.parameters = [
            _Switch(['-nt', 'nt'],
                """By default FastTree expects protein alignments, use -nt for nucleotides""",
                ),
            _Option(['-n', 'n'],
                """-n -- read N multiple alignments in.

                This only works with phylip interleaved format. For example, you can
                use it with the output from phylip's seqboot. If you use -n, FastTree
                will write 1 tree per line to standard output.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-quote', 'quote'],
                """-quote -- add quotes to sequence names in output.

                Quote sequence names in the output and allow spaces, commas,
                parentheses, and colons in them but not ' characters (fasta files only).""",
                ),
            _Option(['-pseudo', 'pseudo'],
                """-pseudo [weight] -- Pseudocounts are used with sequence distance estimation.

                Use pseudocounts to estimate distances between sequences with little or no
                overlap. (Off by default.) Recommended if analyzing the alignment has
                sequences with little or no overlap.
                If the weight is not specified, it is 1.0 """,
                checker_function=_is_numeric,
                equate=False,
                ),
            _Option(['-boot', 'boot'],
                """Specify the number of resamples for support values.

                Support value options:
                By default, FastTree computes local support values by resampling the site
                likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                it will compute minimum-evolution bootstrap supports instead
                In either case, the support values are proportions ranging from 0 to 1

                Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-nosupport', 'nosupport'],
                """Turn off support values.

                Support value options:
                By default, FastTree computes local support values by resampling the site
                likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                it will compute minimum-evolution bootstrap supports instead
                In either case, the support values are proportions ranging from 0 to 1

                Use -nosupport to turn off support values or -boot 100 to use just 100 resamples.""",
                ),
            _Option(['-intree', 'intree'],
                """-intree newickfile -- read the starting tree in from newickfile.

                Any branch lengths in the starting trees are ignored.
                -intree with -n will read a separate starting tree for each alignment.""",
                filename=True,
                equate=False,
                ),
            _Option(['-intree1', 'intree1'],
                """-intree1 newickfile -- read the same starting tree for each alignment.""",
                filename=True,
                equate=False,
                ),
            _Switch(['-quiet', 'quiet'],
                """-quiet -- do not write to standard error during normal operation

                (no progress indicator, no options summary, no likelihood values, etc.)""",
                ),
            _Switch(['-nopr', 'nopr'],
                """-nopr -- do not write the progress indicator to stderr.""",
                ),
            _Option(['-nni', 'nni'],
                """Set the rounds of minimum-evolution nearest-neighbor interchanges

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Option(['-spr', 'spr'],
                """Set the rounds of subtree-prune-regraft moves

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-noml', 'noml'],
                """Deactivate min-evo NNIs and SPRs.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -noml to turn off both min-evo NNIs and SPRs (useful if refining
                an approximately maximum-likelihood tree with further NNIs) """,
                ),
            _Switch(['-mllen', 'mllen'],
                """Optimize branch lengths on a fixed topology.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -mllen to optimize branch lengths without ML NNIs
                Use -mllen -nome with -intree to optimize branch lengths on a fixed topology.""",
                ),
            _Switch(['-nome', 'nome'],
                """Changes support values calculation to a minimum-evolution bootstrap method.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -mllen to optimize branch lengths without ML NNIs
                Use -mllen -nome with -intree to optimize branch lengths on a fixed topology

                Support value options:
                By default, FastTree computes local support values by resampling the site
                likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                it will compute minimum-evolution bootstrap supports instead
                In either case, the support values are proportions ranging from 0 to 1.""",
                ),
            _Option(['-mlnni', 'mlnni'],
                """Set the number of rounds of maximum-likelihood NNIs.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -mlnni to set the number of rounds of maximum-likelihood NNIs.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Option(['-mlacc', 'mlacc'],
                """Option for optimization of branches at each NNI.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,
                and to optimize all 5 branches in 2 or 3 rounds.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-slownni', 'slownni'],
                """Turn off heuristics to avoid constant subtrees with NNIs.

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.
                Use -slownni to turn off heuristics to avoid constant subtrees (affects both
                ML and ME NNIs).""",
                ),
            _Switch(['-wag', 'wag'],
                """Maximum likelihood model options: Whelan-And-Goldman 2001 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. only)""",
                ),
            _Switch(['-gtr', 'gtr'],
                """Maximum likelihood model options: Use generalized time-reversible instead of (default) Jukes-Cantor (nt only)""",
                ),
            _Option(['-cat', 'cat'],
                """Maximum likelihood model options: Specify the number of rate categories of sites (default 20).""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-nocat', 'nocat'],
                """Maximum likelihood model options: No CAT model (just 1 category)""",
                ),
            _Switch(['-gamma', 'gamma'],
                """Report the likelihood under the discrete gamma model.

                Maximum likelihood model options:
                -gamma -- after the final round of optimizing branch lengths with the CAT model,
                report the likelihood under the discrete gamma model with the same
                number of categories. FastTree uses the same branch lengths but
                optimizes the gamma shape parameter and the scale of the lengths.
                The final tree will have rescaled lengths. Used with -log, this
                also generates per-site likelihoods for use with CONSEL, see
                GammaLogToPaup.pl and documentation on the FastTree web site.""",
                ),
            _Switch(['-slow', 'slow'],
                """Use an exhaustive search.

                Searching for the best join:
                By default, FastTree combines the 'visible set' of fast neighbor-joining with
                local hill-climbing as in relaxed neighbor-joining
                -slow -- exhaustive search (like NJ or BIONJ, but different gap handling)
                -slow takes half an hour instead of 8 seconds for 1,250 proteins""",
                ),
            _Switch(['-fastest', 'fastest'],
                """Search the visible set (the top hit for each node) only.

                Searching for the best join:
                By default, FastTree combines the 'visible set' of fast neighbor-joining with
                local hill-climbing as in relaxed neighbor-joining
                -fastest -- search the visible set (the top hit for each node) only
                Unlike the original fast neighbor-joining, -fastest updates visible(C)
                after joining A and B if join(AB,C) is better than join(C,visible(C))
                -fastest also updates out-distances in a very lazy way,
                -fastest sets -2nd on as well, use -fastest -no2nd to avoid this""",
                ),
            _Switch(['-2nd', 'second'],
                """Turn 2nd-level top hits heuristic on.

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                Use -notop (or -slow) to turn this feature off
                and compare all leaves to each other,
                and all new joined nodes to each other

                -2nd or -no2nd to turn 2nd-level top hits heuristic on or off
                This reduces memory usage and running time but may lead to
                marginal reductions in tree quality.
                (By default, -fastest turns on -2nd.)""",
                ),
            _Switch(['-no2nd', 'no2nd'],
                """Turn 2nd-level top hits heuristic off.

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                Use -notop (or -slow) to turn this feature off
                and compare all leaves to each other,
                and all new joined nodes to each other

                -2nd or -no2nd to turn 2nd-level top hits heuristic on or off
                This reduces memory usage and running time but may lead to
                marginal reductions in tree quality.
                (By default, -fastest turns on -2nd.)""",
                ),
            _Option(['-seed', 'seed'],
                """Use -seed to initialize the random number generator.

                Support value options:
                By default, FastTree computes local support values by resampling the site
                likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,
                it will compute minimum-evolution bootstrap supports instead
                In either case, the support values are proportions ranging from 0 to 1""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-top', 'top'],
                """Top-hit list to speed up search

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                Use -notop (or -slow) to turn this feature off
                and compare all leaves to each other,
                and all new joined nodes to each other""",
                ),
            _Switch(['-notop', 'notop'],
                """Turn off top-hit list to speed up search

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                Use -notop (or -slow) to turn this feature off
                and compare all leaves to each other,
                and all new joined nodes to each other""",
                ),
            _Option(['-topm', 'topm'],
                """Change the top hits calculation method

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                -topm 1.0 -- set the top-hit list size to parameter*sqrt(N)
                FastTree estimates the top m hits of a leaf from the
                top 2*m hits of a 'close' neighbor, where close is
                defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),
                and updates the top-hits as joins proceed""",
                checker_function=_is_numeric,
                equate=False,
                ),
            _Option(['-close', 'close'],
                """Modify the close heuristic for the top-hit list

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                -close 0.75 -- modify the close heuristic, lower is more conservative""",
                checker_function=_is_numeric,
                equate=False,
                ),
            _Option(['-refresh', 'refresh'],
                """Parameter for conditions that joined nodes are compared to other nodes

                Top-hit heuristics:
                By default, FastTree uses a top-hit list to speed up search
                -refresh 0.8 -- compare a joined node to all other nodes if its
                top-hit list is less than 80% of the desired length,
                or if the age of the top-hit list is log2(m) or greater""",
                checker_function=_is_numeric,
                equate=False,
                ),
            _Option(['-matrix', 'matrix'],
                """Specify a matrix for nucleotide or amino acid distances

                Distances:
                Default: For protein sequences, log-corrected distances and an
                amino acid dissimilarity matrix derived from BLOSUM45
                or for nucleotide sequences, Jukes-Cantor distances
                To specify a different matrix, use -matrix FilePrefix or -nomatrix""",
                filename=True,
                equate=False,
                ),
            _Switch(['-nomatrix', 'nomatrix'],
                """Specify that no matrix should be used for nucleotide or amino acid distances

                Distances:
                Default: For protein sequences, log-corrected distances and an
                amino acid dissimilarity matrix derived from BLOSUM45
                or for nucleotide sequences, Jukes-Cantor distances
                To specify a different matrix, use -matrix FilePrefix or -nomatrix""",
                ),
            _Switch(['-nj', 'nj'],
                """Join options: regular (unweighted) neighbor-joining (default)""",
                ),
            _Switch(['-bionj', 'bionj'],
                """Join options: weighted joins as in BIONJ. FastTree will also weight joins during NNIs""",
                ),
            _Option(['-gtrrates', 'gtrrates'],
                """-gtrrates ac ag at cg ct gt""",
                equate=False,
                ),
            _Option(['-gtrfreq', 'gtrfreq'],
                """-gtrfreq A C G T""",
                equate=False,
                ),
            _Option(['-constraints', 'constraints'],
                """Specifies an alignment file for use with constrained topology searching

                Constrained topology search options:
                -constraints alignmentfile -- an alignment with values of 0, 1, and -
                Not all sequences need be present. A column of 0s and 1s defines a
                constrained split. Some constraints may be violated
                (see 'violating constraints:' in standard error).""",
                filename=True,
                equate=False,
                ),
            _Option(['-constraintWeight', 'constraintWeight'],
                """Weight strength of contraints in topology searching

                Constrained topology search options:
                -constraintWeight -- how strongly to weight the constraints. A value of 1
                means a penalty of 1 in tree length for violating a constraint
                Default: 100.0""",
                checker_function=_is_numeric,
                equate=False,
                ),
            _Option(['-log', 'log'],
                """Create log files of data such as intermediate trees and per-site rates

                -log logfile -- save intermediate trees so you can extract
                the trees and restart long-running jobs if they crash
                -log also reports the per-site rates (1 means slowest category).""",
                filename=True,
                equate=False,
                ),
            _Option(['-makematrix', 'makematrix'],
                """-makematrix [alignment]""",
                filename=True,
                equate=False,
                ),
            _Switch(['-rawdist', 'rawdist'],
                """Use -rawdist to turn the log-correction off or to use %different instead of Jukes-Cantor in AA or NT distances

                Distances:
                Default: For protein sequences, log-corrected distances and an
                amino acid dissimilarity matrix derived from BLOSUM45
                or for nucleotide sequences, Jukes-Cantor distances
                To specify a different matrix, use -matrix FilePrefix or -nomatrix""",
                ),
            _Option(['-sprlength', 'sprlength'],
                """Use -sprlength set the maximum length of a SPR move (default 10) in topology refinement

                Topology refinement:
                By default, FastTree tries to improve the tree with up to 4*log2(N)
                rounds of minimum-evolution nearest-neighbor interchanges (NNI),
                where N is the number of unique sequences, 2 rounds of
                subtree-prune-regraft (SPR) moves (also min. evo.), and
                up to 2*log(N) rounds of maximum-likelihood NNIs.
                Use -nni to set the number of rounds of min. evo. NNIs,
                and -spr to set the rounds of SPRs.""",
                checker_function=_is_int,
                equate=False,
                ),
            _Switch(['-help', 'help'],
                """Show the help"""
                ),
            _Switch(['-expert', 'expert'],
                """Show the expert level help"""
                ),
            _Option(['-out', 'out'],
                """Enter <output file>

                The path to a Newick Tree output file needs to be specified.""",
                filename=True,
                equate=False,
                ),
            # The alignment file is the only entry marked is_required.
            _Argument(['input'],
                """Enter <input file>

                An input file of sequence alignments in fasta or phylip format is needed. By default FastTree expects protein
                alignments, use -nt for nucleotides""",
                filename=True,
                is_required=True,
                ),
            ]

        AbstractCommandline.__init__(self, cmd, **kwargs)
504