Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  from Bio.Application import _Option, _Switch, AbstractCommandline 
 11   
 12   
class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Example:
    --------

    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Citations:
    ----------

    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Last checked against version: 3.7, briefly against 3.8
    """

    def __init__(self, cmd="muscle", **kwargs):
        """Declare the MUSCLE options and hand over to AbstractCommandline.

        Arguments:
         - cmd - name or path of the MUSCLE executable (default "muscle").
         - kwargs - initial option values keyed by option alias (for
           example ``input=...`` and ``out=...`` as in the class docstring
           example); passed unchanged to ``AbstractCommandline.__init__``.

        The order of ``self.parameters`` is kept exactly as declared below.
        """
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
                                   "kbit20_3", "kmer4_6"]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
            ["pctid_kimura", "pctid_log"]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
        WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
                             "gsc", "threeway"]

        def _is_int(value):
            # Checker for the options MUSCLE documents as "Integer".
            return isinstance(value, int)

        def _is_number(value):
            # Checker for the options MUSCLE documents as "Floating point".
            # Whole numbers are accepted as well, so that e.g. maxhours=2
            # or gapopen=-1 is not rejected merely for lacking a decimal
            # point.
            return isinstance(value, (int, float))

        def _one_of(choices):
            # Build a checker accepting only the listed keyword values.
            return lambda value: value in choices

        # NOTE: every valued option uses equate=False, i.e. "-name value"
        # (space separated) as shown in the class docstring example output.
        self.parameters = [
            # Can't use "in" as the final alias as this is a reserved word
            # in python:
            _Option(["-in", "in", "input"],
                    "Input filename",
                    filename=True,
                    equate=False),
            _Option(["-out", "out"],
                    "Output filename",
                    filename=True,
                    equate=False),
            _Switch(["-diags", "diags"],
                    "Find diagonals (faster for similar sequences)"),
            _Switch(["-profile", "profile"],
                    "Perform a profile alignment"),
            _Option(["-in1", "in1"],
                    "First input filename for profile alignment",
                    filename=True,
                    equate=False),
            _Option(["-in2", "in2"],
                    "Second input filename for a profile alignment",
                    filename=True,
                    equate=False),
            # anchorspacing: Integer, default 32.
            #   Minimum spacing between anchor columns.
            _Option(["-anchorspacing", "anchorspacing"],
                    "Minimum spacing between anchor columns",
                    checker_function=_is_int,
                    equate=False),
            # center: Floating point, default [1] (MUSCLE manual footnote).
            #   Center parameter. Should be negative.
            _Option(["-center", "center"],
                    "Center parameter - should be negative",
                    checker_function=_is_number,
                    equate=False),
            # cluster1 / cluster2: upgma, upgmb or neighborjoining;
            #   default upgmb. Clustering method. cluster1 is used in
            #   iteration 1 and 2, cluster2 in later iterations.
            _Option(["-cluster1", "cluster1"],
                    "Clustering method used in iteration 1",
                    checker_function=_one_of(CLUSTERING_ALGORITHMS),
                    equate=False),
            _Option(["-cluster2", "cluster2"],
                    "Clustering method used in iteration 2",
                    checker_function=_one_of(CLUSTERING_ALGORITHMS),
                    equate=False),
            # diaglength: Integer, default 24.
            #   Minimum length of diagonal.
            # This used to be the only option declared with equate=True
            # (giving "-diaglength=24"); it now uses the same
            # space-separated form as every other valued option here.
            _Option(["-diaglength", "diaglength"],
                    "Minimum length of diagonal",
                    checker_function=_is_int,
                    equate=False),
            # diagmargin: Integer, default 5.
            #   Discard this many positions at ends of diagonal.
            _Option(["-diagmargin", "diagmargin"],
                    "Discard this many positions at ends of diagonal",
                    checker_function=_is_int,
                    equate=False),
            # distance1: kmer6_6, kmer20_3, kmer20_4, kbit20_3 or kmer4_6;
            #   default Kmer6_6 (amino) or Kmer4_6 (nucleo).
            #   Distance measure for iteration 1.
            _Option(["-distance1", "distance1"],
                    "Distance measure for iteration 1",
                    checker_function=_one_of(DISTANCE_MEASURES_ITER1),
                    equate=False),
            # distance2: kmer6_6, kmer20_3, kmer20_4, kbit20_3,
            #   pctid_kimura or pctid_log; default pctid_kimura.
            #   Distance measure for iterations 2, 3 ...
            _Option(["-distance2", "distance2"],
                    "Distance measure for iteration 2",
                    checker_function=_one_of(DISTANCE_MEASURES_ITER2),
                    equate=False),
            # gapopen: Floating point, default [1].
            #   The gap open score. Must be negative.
            _Option(["-gapopen", "gapopen"],
                    "Gap open score - negative number",
                    checker_function=_is_number,
                    equate=False),
            # hydro: Integer, default 5.
            #   Window size for determining whether a region is
            #   hydrophobic.
            _Option(["-hydro", "hydro"],
                    "Window size for hydrophobic region",
                    checker_function=_is_int,
                    equate=False),
            # hydrofactor: Floating point, default 1.2.
            #   Multiplier for gap open/close penalties in hydrophobic
            #   regions.
            _Option(["-hydrofactor", "hydrofactor"],
                    "Multiplier for gap penalties in hydrophobic regions",
                    checker_function=_is_number,
                    equate=False),
            # log: File name, default None.
            #   Log file name (delete existing file).
            _Option(["-log", "log"],
                    "Log file name",
                    filename=True,
                    equate=False),
            # loga: File name, default None.
            #   Log file name (append to existing file).
            _Option(["-loga", "loga"],
                    "Log file name (append to existing file)",
                    filename=True,
                    equate=False),
            # maxdiagbreak: Integer, default 1.
            #   Maximum distance between two diagonals that allows them to
            #   merge into one diagonal.
            _Option(["-maxdiagbreak", "maxdiagbreak"],
                    "Maximum distance between two diagonals that allows "
                    "them to merge into one diagonal",
                    checker_function=_is_int,
                    equate=False),
            # maxhours: Floating point, default None.
            #   Maximum time to run in hours. The actual time may exceed
            #   the requested limit by a few minutes. Decimals are allowed,
            #   so 1.5 means one hour and 30 minutes.
            _Option(["-maxhours", "maxhours"],
                    "Maximum time to run in hours",
                    checker_function=_is_number,
                    equate=False),
            # maxiters: Integer 1, 2 ..., default 16.
            #   Maximum number of iterations.
            _Option(["-maxiters", "maxiters"],
                    "Maximum number of iterations",
                    checker_function=_is_int,
                    equate=False),
            # maxtrees: Integer, default 1.
            #   Maximum number of new trees to build in iteration 2.
            _Option(["-maxtrees", "maxtrees"],
                    "Maximum number of trees to build in iteration 2",
                    checker_function=_is_int,
                    equate=False),
            # minbestcolscore: Floating point, default [1].
            #   Minimum score a column must have to be an anchor.
            _Option(["-minbestcolscore", "minbestcolscore"],
                    "Minimum score a column must have to be an anchor",
                    checker_function=_is_number,
                    equate=False),
            # minsmoothscore: Floating point, default [1].
            #   Minimum smoothed score a column must have to be an anchor.
            _Option(["-minsmoothscore", "minsmoothscore"],
                    "Minimum smoothed score a column must have to "
                    "be an anchor",
                    checker_function=_is_number,
                    equate=False),
            # objscore: sp, ps, dp, xp, spf or spm; default spm.
            #   Objective score used by tree dependent refinement.
            #     sp  = sum-of-pairs score.
            #     spf = sum-of-pairs score (dimer approximation).
            #     spm = sp for < 100 seqs, otherwise spf.
            #     dp  = dynamic programming score.
            #     ps  = average profile-sequence score.
            #     xp  = cross profile score.
            _Option(["-objscore", "objscore"],
                    "Objective score used by tree dependent refinement",
                    checker_function=_one_of(OBJECTIVE_SCORES),
                    equate=False),
            # root1 / root2: pseudo, midlongestspan or minavgleafdist;
            #   default pseudo. Method used to root tree; root1 is used in
            #   iteration 1 and 2, root2 in later iterations.
            _Option(["-root1", "root1"],
                    "Method used to root tree in iteration 1",
                    checker_function=_one_of(TREE_ROOT_METHODS),
                    equate=False),
            _Option(["-root2", "root2"],
                    "Method used to root tree in iteration 2",
                    checker_function=_one_of(TREE_ROOT_METHODS),
                    equate=False),
            # seqtype: protein, nucleo or auto; default auto.
            #   Sequence type.
            _Option(["-seqtype", "seqtype"],
                    "Sequence type",
                    checker_function=_one_of(SEQUENCE_TYPES),
                    equate=False),
            # smoothscoreceil: Floating point, default [1].
            #   Maximum value of column score for smoothing purposes.
            _Option(["-smoothscoreceil", "smoothscoreceil"],
                    "Maximum value of column score for smoothing",
                    checker_function=_is_number,
                    equate=False),
            # smoothwindow: Integer, default 7.
            #   Window used for anchor column smoothing.
            _Option(["-smoothwindow", "smoothwindow"],
                    "Window used for anchor column smoothing",
                    checker_function=_is_int,
                    equate=False),
            # SUEFF: Floating point value between 0 and 1, default 0.1.
            #   Constant used in UPGMB clustering. Determines the relative
            #   fraction of average linkage (SUEFF) vs. nearest-neighbor
            #   linkage (1 - SUEFF).
            _Option(["-sueff", "sueff"],
                    "Constant used in UPGMB clustering",
                    checker_function=_is_number,
                    equate=False),
            # tree1 / tree2: File name, default None.
            #   Save tree produced in first or second iteration to given
            #   file in Newick (Phylip-compatible) format.
            # Flagged as file names like the other file-valued options
            # (-in, -out, -log, ...) so they are handled the same way.
            _Option(["-tree1", "tree1"],
                    "Save Newick tree from iteration 1",
                    filename=True,
                    equate=False),
            _Option(["-tree2", "tree2"],
                    "Save Newick tree from iteration 2",
                    filename=True,
                    equate=False),
            # weight1 / weight2: none, henikoff, henikoffpb, gsc, clustalw
            #   or threeway; default clustalw. Sequence weighting scheme.
            #   weight1 is used in iterations 1 and 2, weight2 is used for
            #   tree-dependent refinement.
            #     none       = all sequences have equal weight.
            #     henikoff   = Henikoff & Henikoff weighting scheme.
            #     henikoffpb = Modified Henikoff scheme as used in
            #                  PSI-BLAST.
            #     clustalw   = CLUSTALW method.
            #     threeway   = Gotoh three-way method.
            _Option(["-weight1", "weight1"],
                    "Weighting scheme used in iteration 1",
                    checker_function=_one_of(WEIGHTING_SCHEMES),
                    equate=False),
            _Option(["-weight2", "weight2"],
                    "Weighting scheme used in iteration 2",
                    checker_function=_one_of(WEIGHTING_SCHEMES),
                    equate=False),
            # ################### FORMATS #################################
            # Multiple formats can be specified on the command line
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format
            # will be used regardless of other formats specified. If both
            # -clw and -clwstrict are specified -clwstrict will be used
            # regardless of other formats specified. If -fasta is specified
            # and not -msf, -clw, or clwstrict, fasta will be used. If
            # -fasta and -html are specified -fasta will be used. Only if
            # -html is specified alone will html be used. I kid ye not.
            #
            # clw (default no): Write output in CLUSTALW format (default is
            #   FASTA).
            _Switch(["-clw", "clw"],
                    "Write output in CLUSTALW format (with a MUSCLE header)"),
            # clwstrict (default no): Write output in CLUSTALW format with
            #   the "CLUSTAL W (1.81)" header rather than the MUSCLE
            #   version. This is useful when a post-processing step is
            #   picky about the file header.
            _Switch(["-clwstrict", "clwstrict"],
                    "Write output in CLUSTALW format with version 1.81 header"),
            # fasta (default yes): Write output in FASTA format.
            #   Alternatives include clw, clwstrict, msf and html.
            _Switch(["-fasta", "fasta"],
                    "Write output in FASTA format"),
            # html (default no): Write output in HTML format (default is
            #   FASTA).
            _Switch(["-html", "html"],
                    "Write output in HTML format"),
            # msf (default no): Write output in MSF format (default is
            #   FASTA).
            _Switch(["-msf", "msf"],
                    "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"],
                    "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"],
                    "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(["-phyiout", "phyiout"],
                    "Write PHYLIP interleaved output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-physout", "physout"],
                    "Write PHYLIP sequential format to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-htmlout", "htmlout"],
                    "Write HTML output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-clwout", "clwout"],
                    "Write CLUSTALW output (with MUSCLE header) to specified "
                    "filename",
                    filename=True,
                    equate=False),
            _Option(["-clwstrictout", "clwstrictout"],
                    "Write CLUSTALW output (with version 1.81 header) to "
                    "specified filename",
                    filename=True,
                    equate=False),
            _Option(["-msfout", "msfout"],
                    "Write MSF format output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-fastaout", "fastaout"],
                    "Write FASTA format output to specified filename",
                    filename=True,
                    equate=False),
            # ############# END FORMATS ###################################
            # anchors (default yes): Use anchor optimization in tree
            #   dependent refinement iterations.
            _Switch(["-anchors", "anchors"],
                    "Use anchor optimisation in tree dependent "
                    "refinement iterations"),
            # noanchors (default no): Disable anchor optimization. Default
            #   is anchors.
            _Switch(["-noanchors", "noanchors"],
                    "Do not use anchor optimisation in tree dependent "
                    "refinement iterations"),
            # group (default yes): Group similar sequences together in the
            #   output. This is the default. See also stable.
            _Switch(["-group", "group"],
                    "Group similar sequences in output"),
            # stable (default no): Preserve input order of sequences in
            #   output file. Default is to group sequences by similarity
            #   (group).
            _Switch(["-stable", "stable"],
                    "Do not group similar sequences in output (not supported in v3.8)"),
            # ############# log-expectation profile score #################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true. -le, -sp, and
            # -sv can be used and produce numerically different logs (what
            # is going on?)
            #
            # spn fails on proteins
            #
            # le (default maybe): Use log-expectation profile score
            #   (VTML240). Alternatives are to use sp or sv. This is the
            #   default for amino acid sequences.
            _Switch(["-le", "le"],
                    "Use log-expectation profile score (VTML240)"),
            # sv (default no): Use sum-of-pairs profile score (VTML240).
            #   Default is le.
            _Switch(["-sv", "sv"],
                    "Use sum-of-pairs profile score (VTML240)"),
            # sp (default no): Use sum-of-pairs protein profile score
            #   (PAM200). Default is le.
            _Switch(["-sp", "sp"],
                    "Use sum-of-pairs protein profile score (PAM200)"),
            # spn (default maybe): Use sum-of-pairs nucleotide profile
            #   score (BLASTZ parameters). This is the only option for
            #   nucleotides, and is therefore the default.
            # Help text corrected: it used to read "protein nucleotide".
            _Switch(["-spn", "spn"],
                    "Use sum-of-pairs nucleotide profile score "
                    "(BLASTZ parameters)"),
            # ############# END log-expectation profile score #############
            # quiet (default no): Do not display progress messages.
            # Help text corrected: it used to repeat the -spn description.
            _Switch(["-quiet", "quiet"],
                    "Do not display progress messages"),
            # refine (default no): Input file is already aligned, skip
            #   first two iterations and begin tree dependent refinement.
            _Switch(["-refine", "refine"],
                    "Only do tree dependent refinement"),
            # core (yes in muscle, no in muscled): Do not catch exceptions.
            # Help text corrected: -core and -nocore descriptions used to
            # be swapped relative to the MUSCLE documentation quoted here.
            _Switch(["-core", "core"],
                    "Do not catch exceptions"),
            # nocore (no in muscle, yes in muscled): Catch exceptions and
            #   give an error message if possible.
            _Switch(["-nocore", "nocore"],
                    "Catch exceptions"),
            # The following MUSCLE options are deliberately not wrapped:
            #
            # termgapsfull (default no): Terminal gaps penalized with full
            #   penalty. [1] Not fully supported in this version.
            #
            # termgapshalf (default yes): Terminal gaps penalized with half
            #   penalty. [1] Not fully supported in this version.
            #
            # termgapshalflonger (default no): Terminal gaps penalized with
            #   half penalty if gap relative to longer sequence, otherwise
            #   with full penalty. [1] Not fully supported in this version.
            #
            # verbose (default no): Write parameter settings and progress
            #   messages to log file.
            _Switch(["-verbose", "verbose"],
                    "Write parameter settings and progress"),
            # version (default no): Write version string to stdout and
            #   exit.
            _Switch(["-version", "version"],
                    "Write version string to stdout and exit"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Executed directly as a script: run the doctests embedded in this
    # module (e.g. the MuscleCommandline usage example).
    import Bio._utils
    Bio._utils.run_doctest()