Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  from Bio.Application import _Option, _Switch, AbstractCommandline 
 11   
 12   
class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Example:
    --------

    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    "C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Citations:
    ----------

    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Last checked against version: 3.7, briefly against 3.8
    """

    def __init__(self, cmd="muscle", **kwargs):
        """Declare the MUSCLE options and hand over to AbstractCommandline.

        ``cmd`` is the MUSCLE executable (name or path).  Any further
        keyword arguments are forwarded unchanged to
        ``AbstractCommandline.__init__`` together with ``cmd``; the example
        in the class docstring passes option values this way, using the
        alias names declared below (e.g. ``input=...``, ``out=...``).
        """
        # Permitted values for the enumerated options; the checker lambdas
        # below close over these lists.
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
                                   "kbit20_3", "kmer4_6"]
        DISTANCE_MEASURES_ITER2 = (DISTANCE_MEASURES_ITER1 +
                                   ["pctid_kimura", "pctid_log"])
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
        WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
                             "gsc", "threeway"]
        self.parameters = [
            # Can't use "in" as the final alias as this is a reserved word
            # in python:
            _Option(["-in", "in", "input"],
                    "Input filename",
                    filename=True,
                    equate=False),
            _Option(["-out", "out"],
                    "Output filename",
                    filename=True,
                    equate=False),
            _Switch(["-diags", "diags"],
                    "Find diagonals (faster for similar sequences)"),
            _Switch(["-profile", "profile"],
                    "Perform a profile alignment"),
            _Option(["-in1", "in1"],
                    "First input filename for profile alignment",
                    filename=True,
                    equate=False),
            _Option(["-in2", "in2"],
                    "Second input filename for a profile alignment",
                    filename=True,
                    equate=False),
            # anchorspacing: integer, MUSCLE default 32.
            _Option(["-anchorspacing", "anchorspacing"],
                    "Minimum spacing between anchor columns",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # center: floating point, should be negative.
            _Option(["-center", "center"],
                    "Center parameter - should be negative",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # cluster1/cluster2: upgma, upgmb or neighborjoining (MUSCLE
            # default upgmb).  Per the MUSCLE documentation cluster1 is used
            # in iteration 1 and 2, cluster2 in later iterations.
            _Option(["-cluster1", "cluster1"],
                    "Clustering method used in iteration 1",
                    checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                    equate=False),
            _Option(["-cluster2", "cluster2"],
                    "Clustering method used in iteration 2",
                    checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                    equate=False),
            # diaglength: integer, MUSCLE default 24.
            # Space separated like every other MUSCLE value option
            # (previously this was the lone option rendered as "-name=value").
            _Option(["-diaglength", "diaglength"],
                    "Minimum length of diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # diagmargin: integer, MUSCLE default 5.
            _Option(["-diagmargin", "diagmargin"],
                    "Discard this many positions at ends of diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # distance1: kmer6_6, kmer20_3, kmer20_4, kbit20_3 or kmer4_6.
            # MUSCLE default is Kmer6_6 (amino) or Kmer4_6 (nucleo).
            _Option(["-distance1", "distance1"],
                    "Distance measure for iteration 1",
                    checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
                    equate=False),
            # distance2: as distance1 plus pctid_kimura and pctid_log.
            # MUSCLE default is pctid_kimura; used for iterations 2, 3 ...
            _Option(["-distance2", "distance2"],
                    "Distance measure for iteration 2",
                    checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
                    equate=False),
            # gapopen: floating point, must be negative.
            _Option(["-gapopen", "gapopen"],
                    "Gap open score - negative number",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # hydro: integer, MUSCLE default 5.  Window size for determining
            # whether a region is hydrophobic.
            _Option(["-hydro", "hydro"],
                    "Window size for hydrophobic region",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # hydrofactor: floating point, MUSCLE default 1.2.  Multiplier
            # for gap open/close penalties in hydrophobic regions.
            _Option(["-hydrofactor", "hydrofactor"],
                    "Multiplier for gap penalties in hydrophobic regions",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # log: log file name (delete existing file).
            _Option(["-log", "log"],
                    "Log file name",
                    filename=True,
                    equate=False),
            # loga: log file name (append to existing file).
            _Option(["-loga", "loga"],
                    "Log file name (append to existing file)",
                    filename=True,
                    equate=False),
            # maxdiagbreak: integer, MUSCLE default 1.
            _Option(["-maxdiagbreak", "maxdiagbreak"],
                    "Maximum distance between two diagonals that allows "
                    "them to merge into one diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # maxhours: floating point, no default.  The actual time may
            # exceed the requested limit by a few minutes.  Decimals are
            # allowed, so 1.5 means one hour and 30 minutes.
            _Option(["-maxhours", "maxhours"],
                    "Maximum time to run in hours",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # maxiters: integer 1, 2 ...; MUSCLE default 16.
            _Option(["-maxiters", "maxiters"],
                    "Maximum number of iterations",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # maxtrees: integer, MUSCLE default 1.
            _Option(["-maxtrees", "maxtrees"],
                    "Maximum number of trees to build in iteration 2",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # minbestcolscore: floating point.
            _Option(["-minbestcolscore", "minbestcolscore"],
                    "Minimum score a column must have to be an anchor",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # minsmoothscore: floating point.
            _Option(["-minsmoothscore", "minsmoothscore"],
                    "Minimum smoothed score a column must have to "
                    "be an anchor",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # objscore (MUSCLE default spm):
            #   sp  = sum-of-pairs score
            #   spf = sum-of-pairs score (dimer approximation)
            #   spm = sp for fewer than 100 seqs, otherwise spf
            #   dp  = dynamic programming score
            #   ps  = average profile-sequence score
            #   xp  = cross profile score
            _Option(["-objscore", "objscore"],
                    "Objective score used by tree dependent refinement",
                    checker_function=lambda x: x in OBJECTIVE_SCORES,
                    equate=False),
            # root1/root2: pseudo, midlongestspan or minavgleafdist (MUSCLE
            # default pseudo).  Per the MUSCLE documentation root1 is used in
            # iteration 1 and 2, root2 in later iterations.
            _Option(["-root1", "root1"],
                    "Method used to root tree in iteration 1",
                    checker_function=lambda x: x in TREE_ROOT_METHODS,
                    equate=False),
            _Option(["-root2", "root2"],
                    "Method used to root tree in iteration 2",
                    checker_function=lambda x: x in TREE_ROOT_METHODS,
                    equate=False),
            # seqtype: protein, nucleo or auto (MUSCLE default auto).
            _Option(["-seqtype", "seqtype"],
                    "Sequence type",
                    checker_function=lambda x: x in SEQUENCE_TYPES,
                    equate=False),
            # smoothscoreceil: floating point.
            _Option(["-smoothscoreceil", "smoothscoreceil"],
                    "Maximum value of column score for smoothing",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # smoothwindow: integer, MUSCLE default 7.
            _Option(["-smoothwindow", "smoothwindow"],
                    "Window used for anchor column smoothing",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # sueff: floating point between 0 and 1, MUSCLE default 0.1.
            # Determines the relative fraction of average linkage (SUEFF)
            # vs. nearest-neighbor linkage (1 - SUEFF).
            _Option(["-sueff", "sueff"],
                    "Constant used in UPGMB clustering",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # tree1/tree2: save the tree produced in the first or second
            # iteration to the given file in Newick (Phylip-compatible)
            # format.  Declared as filenames, like the other file options.
            _Option(["-tree1", "tree1"],
                    "Save Newick tree from iteration 1",
                    filename=True,
                    equate=False),
            _Option(["-tree2", "tree2"],
                    "Save Newick tree from iteration 2",
                    filename=True,
                    equate=False),
            # weight1/weight2: sequence weighting scheme.  weight1 (MUSCLE
            # default clustalw) is used in iterations 1 and 2; weight2 is
            # used for tree-dependent refinement.
            #   none       = all sequences have equal weight
            #   henikoff   = Henikoff & Henikoff weighting scheme
            #   henikoffpb = modified Henikoff scheme as used in PSI-BLAST
            #   clustalw   = CLUSTALW method
            #   threeway   = Gotoh three-way method
            _Option(["-weight1", "weight1"],
                    "Weighting scheme used in iteration 1",
                    checker_function=lambda x: x in WEIGHTING_SCHEMES,
                    equate=False),
            _Option(["-weight2", "weight2"],
                    "Weighting scheme used in iteration 2",
                    checker_function=lambda x: x in WEIGHTING_SCHEMES,
                    equate=False),
            # ################### FORMATS ################################
            # Multiple formats can be specified on the command line.
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format
            # will be used regardless of other formats specified. If both
            # -clw and -clwstrict are specified -clwstrict will be used
            # regardless of other formats specified. If -fasta is specified
            # and not -msf, -clw, or clwstrict, fasta will be used. If
            # -fasta and -html are specified -fasta will be used. Only if
            # -html is specified alone will html be used. I kid ye not.
            _Switch(["-clw", "clw"],
                    "Write output in CLUSTALW format (with a MUSCLE header)"),
            # clwstrict writes the "CLUSTAL W (1.81)" header rather than the
            # MUSCLE version; useful when a post-processing step is picky
            # about the file header.
            _Switch(["-clwstrict", "clwstrict"],
                    "Write output in CLUSTALW format with version 1.81 header"),
            # FASTA is the MUSCLE default output format.
            _Switch(["-fasta", "fasta"],
                    "Write output in FASTA format"),
            _Switch(["-html", "html"],
                    "Write output in HTML format"),
            _Switch(["-msf", "msf"],
                    "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"],
                    "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"],
                    "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(["-phyiout", "phyiout"],
                    "Write PHYLIP interleaved output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-physout", "physout"],
                    "Write PHYLIP sequential format to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-htmlout", "htmlout"],
                    "Write HTML output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-clwout", "clwout"],
                    "Write CLUSTALW output (with MUSCLE header) to specified "
                    "filename",
                    filename=True,
                    equate=False),
            _Option(["-clwstrictout", "clwstrictout"],
                    "Write CLUSTALW output (with version 1.81 header) to "
                    "specified filename",
                    filename=True,
                    equate=False),
            _Option(["-msfout", "msfout"],
                    "Write MSF format output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-fastaout", "fastaout"],
                    "Write FASTA format output to specified filename",
                    filename=True,
                    equate=False),
            # ############# END FORMATS ###################################
            # anchors is the MUSCLE default; noanchors disables it.
            _Switch(["-anchors", "anchors"],
                    "Use anchor optimisation in tree dependent "
                    "refinement iterations"),
            _Switch(["-noanchors", "noanchors"],
                    "Do not use anchor optimisation in tree dependent "
                    "refinement iterations"),
            # group is the MUSCLE default; stable preserves the input order
            # of sequences in the output file instead.
            _Switch(["-group", "group"],
                    "Group similar sequences in output"),
            _Switch(["-stable", "stable"],
                    "Do not group similar sequences in output (not supported in v3.8)"),
            # ############# log-expectation profile score #################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
            # can be used and produce numerically different logs (what is
            # going on?)
            #
            # spn fails on proteins
            #
            # le is the MUSCLE default for amino acid sequences.
            _Switch(["-le", "le"],
                    "Use log-expectation profile score (VTML240)"),
            _Switch(["-sv", "sv"],
                    "Use sum-of-pairs profile score (VTML240)"),
            _Switch(["-sp", "sp"],
                    "Use sum-of-pairs protein profile score (PAM200)"),
            # spn is the nucleotide score; the help text previously read
            # "protein nucleotide", which contradicted itself.
            _Switch(["-spn", "spn"],
                    "Use sum-of-pairs nucleotide profile score "
                    "(BLASTZ parameters)"),
            # ############# END log-expectation profile score #############
            # The help text here used to be a copy of the -spn description.
            _Switch(["-quiet", "quiet"],
                    "Do not display progress messages"),
            # refine: input file is already aligned, skip first two
            # iterations and begin tree dependent refinement.
            _Switch(["-refine", "refine"],
                    "Only do tree dependent refinement"),
            # core: yes in muscle, no in muscled.  Do not catch exceptions.
            # (The -core/-nocore help texts were previously swapped.)
            _Switch(["-core", "core"],
                    "Do not catch exceptions"),
            # nocore: no in muscle, yes in muscled.  Catch exceptions and
            # give an error message if possible.
            _Switch(["-nocore", "nocore"],
                    "Catch exceptions"),
            # Not wrapped here; MUSCLE documents all three as "not fully
            # supported in this version":
            #   termgapsfull       - terminal gaps penalized with full
            #                        penalty
            #   termgapshalf       - terminal gaps penalized with half
            #                        penalty
            #   termgapshalflonger - half penalty if gap relative to longer
            #                        sequence, otherwise full penalty
            #
            # verbose: write parameter settings and progress messages to
            # log file.
            _Switch(["-verbose", "verbose"],
                    "Write parameter settings and progress"),
            _Switch(["-version", "version"],
                    "Write version string to stdout and exit"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Executed as a script: run the doctests via Biopython's helper.
    from Bio import _utils

    _utils.run_doctest()