Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  from Bio.Application import _Option, _Switch, AbstractCommandline 
 11   
 12   
13 -class MuscleCommandline(AbstractCommandline):
14 r"""Command line wrapper for the multiple alignment program MUSCLE. 15 16 http://www.drive5.com/muscle/ 17 18 Example: 19 -------- 20 21 >>> from Bio.Align.Applications import MuscleCommandline 22 >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" 23 >>> in_file = r"C:\My Documents\unaligned.fasta" 24 >>> out_file = r"C:\My Documents\aligned.fasta" 25 >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) 26 >>> print(muscle_cline) 27 "C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" 28 29 You would typically run the command line with muscle_cline() or via 30 the Python subprocess module, as described in the Biopython tutorial. 31 32 Citations: 33 ---------- 34 35 Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high 36 accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. 37 38 Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with 39 reduced time and space complexity. BMC Bioinformatics 5(1): 113. 40 41 Last checked against version: 3.7, briefly against 3.8 42 """
43 - def __init__(self, cmd="muscle", **kwargs):
44 CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] 45 DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", 46 "kbit20_3", "kmer4_6"] 47 DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ 48 ["pctid_kimura", "pctid_log"] 49 OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] 50 TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] 51 SEQUENCE_TYPES = ["protein", "nucleo", "auto"] 52 WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb", 53 "gsc", "threeway"] 54 self.parameters = [ 55 # Can't use "in" as the final alias as this 56 # is a reserved word in python: 57 _Option(["-in", "in", "input"], 58 "Input filename", 59 filename=True, 60 equate=False), 61 _Option(["-out", "out"], 62 "Output filename", 63 filename=True, 64 equate=False), 65 _Switch(["-diags", "diags"], 66 "Find diagonals (faster for similar sequences)"), 67 _Switch(["-profile", "profile"], 68 "Perform a profile alignment"), 69 _Option(["-in1", "in1"], 70 "First input filename for profile alignment", 71 filename=True, 72 equate=False), 73 _Option(["-in2", "in2"], 74 "Second input filename for a profile alignment", 75 filename=True, 76 equate=False), 77 # anchorspacing Integer 32 Minimum spacing 78 # between anchor cols 79 _Option(["-anchorspacing", "anchorspacing"], 80 "Minimum spacing between anchor columns", 81 checker_function=lambda x: isinstance(x, int), 82 equate=False), 83 # center Floating point [1] Center parameter. 84 # Should be negative. 85 _Option(["-center", "center"], 86 "Center parameter - should be negative", 87 checker_function=lambda x: isinstance(x, float), 88 equate=False), 89 # cluster1 upgma upgmb Clustering method. 90 _Option(["-cluster1", "cluster1"], 91 "Clustering method used in iteration 1", 92 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 93 equate=False), 94 # cluster2 upgmb cluster1 is used 95 # neighborjoining in iteration 1 and 96 # 2, cluster2 in 97 # later iterations. 98 _Option(["-cluster2", "cluster2"], 99 "Clustering method used in iteration 2", 100 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 101 equate=False), 102 # diaglength Integer 24 Minimum length of 103 # diagonal. 104 _Option(["-diaglength", "diaglength"], 105 "Minimum length of diagonal", 106 checker_function=lambda x: isinstance(x, int), 107 equate=True), 108 # diagmargin Integer 5 Discard this many 109 # positions at ends 110 # of diagonal. 111 _Option(["-diagmargin", "diagmargin"], 112 "Discard this many positions at ends of diagonal", 113 checker_function=lambda x: isinstance(x, int), 114 equate=False), 115 # distance1 kmer6_6 Kmer6_6(amino) or Distance measure 116 # kmer20_3 Kmer4_6(nucleo) for iteration 1 117 # kmer20_4 118 # kbit20_3 119 # kmer4_6 120 _Option(["-distance1", "distance1"], 121 "Distance measure for iteration 1", 122 checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, 123 equate=False), 124 # distance2 kmer6_6 pctid_kimura Distance measure 125 # kmer20_3 for iterations 126 # kmer20_4 2, 3 ... 127 # kbit20_3 128 # pctid_kimura 129 # pctid_log 130 _Option(["-distance2", "distance2"], 131 "Distance measure for iteration 2", 132 checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, 133 equate=False), 134 # gapextend Floating point [1] The gap extend score 135 _Option(["-gapextend", "gapextend"], 136 "Gap extension penalty", 137 checker_function=lambda x: isinstance(x, float), 138 equate=False), 139 # gapopen Floating point [1] The gap open score 140 # Must be negative. 141 _Option(["-gapopen", "gapopen"], 142 "Gap open score - negative number", 143 checker_function=lambda x: isinstance(x, float), 144 equate=False), 145 # hydro Integer 5 Window size for 146 # determining whether 147 # a region is 148 # hydrophobic. 149 _Option(["-hydro", "hydro"], 150 "Window size for hydrophobic region", 151 checker_function=lambda x: isinstance(x, int), 152 equate=False), 153 # hydrofactor Floating point 1.2 Multiplier for gap 154 # open/close 155 # penalties in 156 # hydrophobic regions 157 _Option(["-hydrofactor", "hydrofactor"], 158 "Multiplier for gap penalties in hydrophobic regions", 159 checker_function=lambda x: isinstance(x, float), 160 equate=False), 161 # log File name None. Log file name 162 # (delete existing 163 # file). 164 _Option(["-log", "log"], 165 "Log file name", 166 filename=True, 167 equate=False), 168 # loga File name None. Log file name 169 # (append to existing 170 # file). 171 _Option(["-loga", "loga"], 172 "Log file name (append to existing file)", 173 filename=True, 174 equate=False), 175 # matrix File name None. File name for 176 # substitution matrix 177 # in NCBI or WU-BLAST 178 # format. If you 179 # specify your own 180 # matrix, you should 181 # also specify: 182 # -gapopen <g> 183 # -gapextend <e> 184 # -center 0.0 185 _Option(["-matrix", "matrix"], 186 "path to NCBI or WU-BLAST format protein substitution " 187 "matrix - also set -gapopen, -gapextend and -center", 188 filename=True, 189 equate=False), 190 # diagbreak Integer 1 Maximum distance 191 # between two 192 # diagonals that 193 # allows them to 194 # merge into one 195 # diagonal. 196 _Option(["-diagbreak", "diagbreak"], 197 "Maximum distance between two diagonals that allows " 198 "them to merge into one diagonal", 199 checker_function=lambda x: isinstance(x, int), 200 equate=False), 201 _Option(["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8 202 "Deprecated in v3.8, use -diagbreak instead.", 203 checker_function=lambda x: isinstance(x, int), 204 equate=False), 205 # maxhours Floating point None. Maximum time to 206 # run in hours. The 207 # actual time may 208 # exceed requested 209 # limit by a few 210 # minutes. Decimals 211 # are allowed, so 1.5 212 # means one hour and 213 # 30 minutes. 214 _Option(["-maxhours", "maxhours"], 215 "Maximum time to run in hours", 216 checker_function=lambda x: isinstance(x, float), 217 equate=False), 218 # maxiters Integer 1, 2 ... 16 Maximum number of 219 # iterations. 220 _Option(["-maxiters", "maxiters"], 221 "Maximum number of iterations", 222 checker_function=lambda x: isinstance(x, int), 223 equate=False), 224 # maxtrees Integer 1 Maximum number of 225 # new trees to build 226 # in iteration 2. 227 _Option(["-maxtrees", "maxtrees"], 228 "Maximum number of trees to build in iteration 2", 229 checker_function=lambda x: isinstance(x, int), 230 equate=False), 231 # minbestcolscore Floating point [1] Minimum score a 232 # column must have to 233 # be an anchor. 234 _Option(["-minbestcolscore", "minbestcolscore"], 235 "Minimum score a column must have to be an anchor", 236 checker_function=lambda x: isinstance(x, float), 237 equate=False), 238 # minsmoothscore Floating point [1] Minimum smoothed 239 # score a column must 240 # have to be an 241 # anchor. 242 _Option(["-minsmoothscore", "minsmoothscore"], 243 "Minimum smoothed score a column must have to " 244 "be an anchor", 245 checker_function=lambda x: isinstance(x, float), 246 equate=False), 247 # objscore sp spm Objective score 248 # ps used by tree 249 # dp dependent 250 # xp refinement. 251 # spf sp=sum-of-pairs 252 # spm score. (dimer 253 # approximation) 254 # spm=sp for < 100 255 # seqs, otherwise spf 256 # dp=dynamic 257 # programming score. 258 # ps=average profile- 259 # sequence score. 260 # xp=cross profile 261 # score. 262 _Option(["-objscore", "objscore"], 263 "Objective score used by tree dependent refinement", 264 checker_function=lambda x: x in OBJECTIVE_SCORES, 265 equate=False), 266 # refinewindow Integer 200 Length of window 267 # for -refinew. 268 _Option(["-refinewindow", "refinewindow"], 269 "Length of window for -refinew", 270 checker_function=lambda x: isinstance(x, int), 271 equate=False), 272 # root1 pseudo pseudo Method used to root 273 _Option(["-root1", "root1"], 274 "Method used to root tree in iteration 1", 275 checker_function=lambda x: x in TREE_ROOT_METHODS, 276 equate=False), 277 # root2 midlongestspan tree; root1 is 278 # minavgleafdist used in iteration 1 279 # and 2, root2 in 280 # later iterations. 281 _Option(["-root2", "root2"], 282 "Method used to root tree in iteration 2", 283 checker_function=lambda x: x in TREE_ROOT_METHODS, 284 equate=False), 285 # scorefile File name None File name where to 286 # write a score file. 287 # This contains one 288 # line for each column 289 # in the alignment. 290 # The line contains 291 # the letters in the 292 # column followed by 293 # the average BLOSUM62 294 # score over pairs of 295 # letters in the 296 # column. 297 _Option(["-scorefile", "scorefile"], 298 "Score file name, contains one line for each column" 299 " in the alignment with average BLOSUM62 score", 300 filename=True, 301 equate=False), 302 # seqtype protein auto Sequence type. 303 # nucleo 304 # auto 305 _Option(["-seqtype", "seqtype"], 306 "Sequence type", 307 checker_function=lambda x: x in SEQUENCE_TYPES, 308 equate=False), 309 # smoothscoreceil Floating point [1] Maximum value of 310 # column score for 311 # smoothing purposes. 312 _Option(["-smoothscoreceil", "smoothscoreceil"], 313 "Maximum value of column score for smoothing", 314 checker_function=lambda x: isinstance(x, float), 315 equate=False), 316 # smoothwindow Integer 7 Window used for 317 # anchor column 318 # smoothing. 319 _Option(["-smoothwindow", "smoothwindow"], 320 "Window used for anchor column smoothing", 321 checker_function=lambda x: isinstance(x, int), 322 equate=False), 323 # spscore File name Compute SP 324 # objective score of 325 # multiple alignment. 326 _Option(["-spscore", "spscore"], 327 "Compute SP objective score of multiple alignment", 328 filename=True, 329 equate=False), 330 # SUEFF Floating point value 0.1 Constant used in 331 # between 0 and 1. UPGMB clustering. 332 # Determines the 333 # relative fraction 334 # of average linkage 335 # (SUEFF) vs. nearest 336 # neighbor linkage 337 # (1 SUEFF). 338 _Option(["-sueff", "sueff"], 339 "Constant used in UPGMB clustering", 340 checker_function=lambda x: isinstance(x, float), 341 equate=False), 342 # tree1 File name None Save tree 343 _Option(["-tree1", "tree1"], 344 "Save Newick tree from iteration 1", 345 equate=False), 346 # tree2 first or second 347 # iteration to given 348 # file in Newick 349 # (Phylip-compatible) 350 # format. 351 _Option(["-tree2", "tree2"], 352 "Save Newick tree from iteration 2", 353 equate=False), 354 # usetree File name None Use given tree as 355 # guide tree. Must by 356 # in Newick 357 # (Phyip-compatible) 358 # format. 359 _Option(["-usetree", "usetree"], 360 "Use given Newick tree as guide tree", 361 filename=True, 362 equate=False), 363 # weight1 none clustalw Sequence weighting 364 _Option(["-weight1", "weight1"], 365 "Weighting scheme used in iteration 1", 366 checker_function=lambda x: x in WEIGHTING_SCHEMES, 367 equate=False), 368 # weight2 henikoff scheme. 369 # henikoffpb weight1 is used in 370 # gsc iterations 1 and 2. 371 # clustalw weight2 is used for 372 # threeway tree-dependent 373 # refinement. 374 # none=all sequences 375 # have equal weight. 376 # henikoff=Henikoff & 377 # Henikoff weighting 378 # scheme. 379 # henikoffpb=Modified 380 # Henikoff scheme as 381 # used in PSI-BLAST. 382 # clustalw=CLUSTALW 383 # method. 384 # threeway=Gotoh 385 # three-way method. 386 _Option(["-weight2", "weight2"], 387 "Weighting scheme used in iteration 2", 388 checker_function=lambda x: x in WEIGHTING_SCHEMES, 389 equate=False), 390 # ################### FORMATS #################################### 391 # Multiple formats can be specified on the command line 392 # If -msf appears it will be used regardless of other formats 393 # specified. If -clw appears (and not -msf), clustalw format will 394 # be used regardless of other formats specified. If both -clw and 395 # -clwstrict are specified -clwstrict will be used regardless of 396 # other formats specified. If -fasta is specified and not -msf, 397 # -clw, or clwstrict, fasta will be used. If -fasta and -html are 398 # specified -fasta will be used. Only if -html is specified alone 399 # will html be used. I kid ye not. 400 # clw no Write output in CLUSTALW format 401 # (default is FASTA). 402 _Switch(["-clw", "clw"], 403 "Write output in CLUSTALW format (with a MUSCLE header)"), 404 # clwstrict no Write output in CLUSTALW format with 405 # the "CLUSTAL W (1.81)" header rather 406 # than the MUSCLE version. This is 407 # useful when a post-processing step is 408 # picky about the file header. 409 _Switch(["-clwstrict", "clwstrict"], 410 "Write output in CLUSTALW format with version" 411 "1.81 header"), 412 # fasta yes Write output in FASTA format. 413 # Alternatives include clw, 414 # clwstrict, msf and html. 415 _Switch(["-fasta", "fasta"], 416 "Write output in FASTA format"), 417 # html no Write output in HTML format (default 418 # is FASTA). 419 _Switch(["-html", "html"], 420 "Write output in HTML format"), 421 # msf no Write output in MSF format (default 422 # is FASTA). 423 _Switch(["-msf", "msf"], 424 "Write output in MSF format"), 425 # Phylip interleaved - undocumented as of 3.7 426 _Switch(["-phyi", "phyi"], 427 "Write output in PHYLIP interleaved format"), 428 # Phylip sequential - undocumented as of 3.7 429 _Switch(["-phys", "phys"], 430 "Write output in PHYLIP sequential format"), 431 # ################# Additional specified output files ######### 432 _Option(["-phyiout", "phyiout"], 433 "Write PHYLIP interleaved output to specified filename", 434 filename=True, 435 equate=False), 436 _Option(["-physout", "physout"], 437 "Write PHYLIP sequential format to specified filename", 438 filename=True, 439 equate=False), 440 _Option(["-htmlout", "htmlout"], 441 "Write HTML output to specified filename", 442 filename=True, 443 equate=False), 444 _Option(["-clwout", "clwout"], 445 "Write CLUSTALW output (with MUSCLE header) to specified " 446 "filename", 447 filename=True, 448 equate=False), 449 _Option(["-clwstrictout", "clwstrictout"], 450 "Write CLUSTALW output (with version 1.81 header) to " 451 "specified filename", 452 filename=True, 453 equate=False), 454 _Option(["-msfout", "msfout"], 455 "Write MSF format output to specified filename", 456 filename=True, 457 equate=False), 458 _Option(["-fastaout", "fastaout"], 459 "Write FASTA format output to specified filename", 460 filename=True, 461 equate=False), 462 # ############# END FORMATS ################################### 463 # anchors yes Use anchor optimization in tree 464 # dependent refinement iterations. 465 _Switch(["-anchors", "anchors"], 466 "Use anchor optimisation in tree dependent " 467 "refinement iterations"), 468 # noanchors no Disable anchor optimization. Default 469 # is anchors. 470 _Switch(["-noanchors", "noanchors"], 471 "Do not use anchor optimisation in tree dependent " 472 "refinement iterations"), 473 # brenner no Use Steven Brenner's method for 474 # computing the root alignment. 475 _Switch(["-brenner", "brenner"], 476 "Use Steve Brenner's root alignment method"), 477 # cluster no Perform fast clustering of input 478 # sequences. Use the tree1 option to 479 # save the tree. 480 _Switch(["-cluster", "cluster"], 481 "Perform fast clustering of input sequences, " 482 "use -tree1 to save tree"), 483 # dimer no Use dimer approximation for the 484 # SP score (faster, less accurate). 485 _Switch(["-dimer", "dimer"], 486 "Use faster (slightly less accurate) dimer approximation" 487 "for the SP score"), 488 # group yes Group similar sequences together 489 # in the output. This is the default. 490 # See also stable. 491 _Switch(["-group", "group"], 492 "Group similar sequences in output"), 493 # ############# log-expectation profile score #################### 494 # One of either -le, -sp, or -sv 495 # 496 # According to the doc, spn is default and the only option for 497 # nucleotides: this doesn't appear to be true. -le, -sp, and -sv 498 # can be used and produce numerically different logs 499 # (what is going on?) 500 # 501 # spn fails on proteins 502 # le maybe Use log-expectation profile score 503 # (VTML240). Alternatives are to use sp 504 # or sv. This is the default for amino 505 # acid sequences. 506 _Switch(["-le", "le"], 507 "Use log-expectation profile score (VTML240)"), 508 # sv no Use sum-of-pairs profile score 509 # (VTML240). Default is le. 510 _Switch(["-sv", "sv"], 511 "Use sum-of-pairs profile score (VTML240)"), 512 # sp no Use sum-of-pairs protein profile 513 # score (PAM200). Default is le. 514 _Switch(["-sp", "sp"], 515 "Use sum-of-pairs protein profile score (PAM200)"), 516 # spn maybe Use sum-of-pairs nucleotide profile 517 # score (BLASTZ parameters). This is 518 # the only option for nucleotides, 519 # and is therefore the default. 520 _Switch(["-spn", "spn"], 521 "Use sum-of-pairs protein nucleotide profile score"), 522 # ########## END log-expectation profile score ################### 523 # quiet no Do not display progress messages. 524 _Switch(["-quiet", "quiet"], 525 "Do not display progress messages"), 526 # refine no Input file is already aligned, skip 527 # first two iterations and begin tree 528 # dependent refinement. 529 _Switch(["-refine", "refine"], 530 "Only do tree dependent refinement"), 531 # refinew no Refine an alignment by dividing it 532 # into non-overlapping windows and 533 # re-aligning each window. Typically 534 # used for whole-genome nucleotide 535 # alignments. 536 _Switch(["-refinew", "refinew"], 537 "Only do tree dependent refinement using " 538 "sliding window approach"), 539 # core yes in muscle, Do not catch exceptions. 540 # no in muscled. 541 _Switch(["-core", "core"], 542 "Do not catch exceptions"), 543 # nocore no in muscle, Catch exceptions and give an 544 # yes in muscled. error message if possible. 545 _Switch(["-nocore", "nocore"], 546 "Catch exceptions"), 547 # stable no Preserve input order of sequences 548 # in output file. Default is to group 549 # sequences by similarity (group). 550 _Switch(["-stable", "stable"], 551 "Do not group similar sequences in output " 552 "(not supported in v3.8)"), 553 554 # termgaps4 yes Use 4-way test for treatment of 555 # terminal gaps. 556 # (Cannot be disabled in this version). 557 # 558 # termgapsfull no Terminal gaps penalized with 559 # full penalty. [1] Not fully 560 # supported in this version 561 # 562 # termgapshalf yes Terminal gaps penalized with 563 # half penalty. [1] Not fully 564 # supported in this version 565 # 566 # termgapshalflonger no Terminal gaps penalized with 567 # half penalty if gap relative 568 # to longer sequence, otherwise with 569 # full penalty. [1] Not fully 570 # supported in this version 571 # 572 # verbose no Write parameter settings and 573 # progress messages to log file. 574 _Switch(["-verbose", "verbose"], 575 "Write parameter settings and progress"), 576 # version no Write version string to 577 # stdout and exit 578 _Switch(["-version", "version"], 579 "Write version string to stdout and exit"), 580 ] 581 AbstractCommandline.__init__(self, cmd, **kwargs)
582 583 584 if __name__ == "__main__": 585 from Bio._utils import run_doctest 586 run_doctest() 587