Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 11   
 12  from Bio.Application import _Option, _Switch, AbstractCommandline 
 13   
 14   
15 -class MuscleCommandline(AbstractCommandline):
16 r"""Command line wrapper for the multiple alignment program MUSCLE. 17 18 http://www.drive5.com/muscle/ 19 20 Example: 21 22 >>> from Bio.Align.Applications import MuscleCommandline 23 >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" 24 >>> in_file = r"C:\My Documents\unaligned.fasta" 25 >>> out_file = r"C:\My Documents\aligned.fasta" 26 >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) 27 >>> print(muscle_cline) 28 "C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" 29 30 You would typically run the command line with muscle_cline() or via 31 the Python subprocess module, as described in the Biopython tutorial. 32 33 Citations: 34 35 Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high 36 accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. 37 38 Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with 39 reduced time and space complexity. BMC Bioinformatics 5(1): 113. 40 41 Last checked against version: 3.7, briefly against 3.8 42 """
43 - def __init__(self, cmd="muscle", **kwargs):
44 CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] 45 DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3", 46 "kmer4_6"] 47 DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ 48 ["pctid_kimura", "pctid_log"] 49 OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] 50 TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] 51 SEQUENCE_TYPES = ["protein", "nucleo", "auto"] 52 WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb", 53 "gsc", "threeway"] 54 self.parameters = \ 55 [ 56 #Can't use "in" as the final alias as this is a reserved word in python: 57 _Option(["-in", "in", "input"], 58 "Input filename", 59 filename=True, 60 equate=False), 61 _Option(["-out", "out"], 62 "Output filename", 63 filename=True, 64 equate=False), 65 _Switch(["-diags", "diags"], 66 "Find diagonals (faster for similar sequences)"), 67 _Switch(["-profile", "profile"], 68 "Perform a profile alignment"), 69 _Option(["-in1", "in1"], 70 "First input filename for profile alignment", 71 filename=True, 72 equate=False), 73 _Option(["-in2", "in2"], 74 "Second input filename for a profile alignment", 75 filename=True, 76 equate=False), 77 #anchorspacing Integer 32 Minimum spacing between 78 _Option(["-anchorspacing", "anchorspacing"], 79 "Minimum spacing between anchor columns", 80 checker_function=lambda x: isinstance(x, int), 81 equate=False), 82 #center Floating point [1] Center parameter. 83 # Should be negative. 84 _Option(["-center", "center"], 85 "Center parameter - should be negative", 86 checker_function=lambda x: isinstance(x, float), 87 equate=False), 88 #cluster1 upgma upgmb Clustering method. 89 _Option(["-cluster1", "cluster1"], 90 "Clustering method used in iteration 1", 91 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 92 equate=False), 93 #cluster2 upgmb cluster1 is used in 94 # neighborjoining iteration 1 and 2, 95 # cluster2 in later 96 # iterations. 97 _Option(["-cluster2", "cluster2"], 98 "Clustering method used in iteration 2", 99 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 100 equate=False), 101 #diaglength Integer 24 Minimum length of 102 # diagonal. 103 _Option(["-diaglength", "diaglength"], 104 "Minimum length of diagonal", 105 checker_function=lambda x: isinstance(x, int), 106 equate=True), 107 #diagmargin Integer 5 Discard this many 108 # positions at ends of 109 # diagonal. 110 _Option(["-diagmargin", "diagmargin"], 111 "Discard this many positions at ends of diagonal", 112 checker_function=lambda x: isinstance(x, int), 113 equate=False), 114 #distance1 kmer6_6 Kmer6_6 (amino) or Distance measure for 115 # kmer20_3 Kmer4_6 (nucleo) iteration 1. 116 # kmer20_4 117 # kbit20_3 118 # kmer4_6 119 _Option(["-distance1", "distance1"], 120 "Distance measure for iteration 1", 121 checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, 122 equate=False), 123 #distance2 kmer6_6 pctid_kimura Distance measure for 124 # kmer20_3 iterations 2, 3 ... 125 # kmer20_4 126 # kbit20_3 127 # pctid_kimura 128 # pctid_log 129 _Option(["-distance2", "distance2"], 130 "Distance measure for iteration 2", 131 checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, 132 equate=False), 133 #gapopen Floating point [1] The gap open score. 134 # Must be negative. 135 _Option(["-gapopen", "gapopen"], 136 "Gap open score - negative number", 137 checker_function=lambda x: isinstance(x, float), 138 equate=False), 139 #hydro Integer 5 Window size for 140 # determining whether a 141 # region is hydrophobic. 142 _Option(["-hydro", "hydro"], 143 "Window size for hydrophobic region", 144 checker_function=lambda x: isinstance(x, int), 145 equate=False), 146 #hydrofactor Floating point 1.2 Multiplier for gap 147 # open/close penalties in 148 # hydrophobic regions. 149 _Option(["-hydrofactor", "hydrofactor"], 150 "Multiplier for gap penalties in hydrophobic regions", 151 checker_function=lambda x: isinstance(x, float), 152 equate=False), 153 #log File name None. Log file name (delete 154 # existing file). 155 _Option(["-log", "log"], 156 "Log file name", 157 filename=True, 158 equate=False), 159 #loga File name None. Log file name (append 160 # to existing file). 161 _Option(["-loga", "loga"], 162 "Log file name (append to existing file)", 163 filename=True, 164 equate=False), 165 #maxdiagbreak Integer 1 Maximum distance 166 # between two diagonals 167 # that allows them to 168 # merge into one 169 # diagonal. 170 _Option(["-maxdiagbreak", "maxdiagbreak"], 171 "Maximum distance between two diagonals that allows " 172 "them to merge into one diagonal", 173 checker_function=lambda x: isinstance(x, int), 174 equate=False), 175 #maxhours Floating point None. Maximum time to run in 176 # hours. The actual time 177 # may exceed the 178 # requested limit by a 179 # few minutes. Decimals 180 # are allowed, so 1.5 181 # means one hour and 30 182 # minutes. 183 _Option(["-maxhours", "maxhours"], 184 "Maximum time to run in hours", 185 checker_function=lambda x: isinstance(x, float), 186 equate=False), 187 #maxiters Integer 1, 2 ... 16 Maximum number of 188 # iterations. 189 _Option(["-maxiters", "maxiters"], 190 "Maximum number of iterations", 191 checker_function=lambda x: isinstance(x, int), 192 equate=False), 193 #maxtrees Integer 1 Maximum number of new 194 # trees to build in 195 # iteration 2. 196 _Option(["-maxtrees", "maxtrees"], 197 "Maximum number of trees to build in iteration 2", 198 checker_function=lambda x: isinstance(x, int), 199 equate=False), 200 #minbestcolscore Floating point [1] Minimum score a column 201 # must have to be an 202 # anchor. 203 _Option(["-minbestcolscore", "minbestcolscore"], 204 "Minimum score a column must have to be an anchor", 205 checker_function=lambda x: isinstance(x, float), 206 equate=False), 207 #minsmoothscore Floating point [1] Minimum smoothed score 208 # a column must have to 209 # be an anchor. 210 _Option(["-minsmoothscore", "minsmoothscore"], 211 "Minimum smoothed score a column must have to " 212 "be an anchor", 213 checker_function=lambda x: isinstance(x, float), 214 equate=False), 215 #objscore sp spm Objective score used by 216 # ps tree dependent 217 # dp refinement. 218 # xp sp=sum-of-pairs score. 219 # spf spf=sum-of-pairs score 220 # spm (dimer approximation) 221 # spm=sp for < 100 seqs, 222 # otherwise spf 223 # dp=dynamic programming 224 # score. 225 # ps=average profile- 226 # sequence score. 227 # xp=cross profile score. 228 _Option(["-objscore", "objscore"], 229 "Objective score used by tree dependent refinement", 230 checker_function=lambda x: x in OBJECTIVE_SCORES, 231 equate=False), 232 #root1 pseudo pseudo Method used to root 233 _Option(["-root1", "root1"], 234 "Method used to root tree in iteration 1", 235 checker_function=lambda x: x in TREE_ROOT_METHODS, 236 equate=False), 237 #root2 midlongestspan tree; root1 is used in 238 # minavgleafdist iteration 1 and 2, 239 # root2 in later 240 # iterations. 241 _Option(["-root2", "root2"], 242 "Method used to root tree in iteration 2", 243 checker_function=lambda x: x in TREE_ROOT_METHODS, 244 equate=False), 245 #seqtype protein auto Sequence type. 246 # nucleo 247 # auto 248 _Option(["-seqtype", "seqtype"], 249 "Sequence type", 250 checker_function=lambda x: x in SEQUENCE_TYPES, 251 equate=False), 252 #smoothscoreceil Floating point [1] Maximum value of column 253 # score for smoothing 254 # purposes. 255 _Option(["-smoothscoreceil", "smoothscoreceil"], 256 "Maximum value of column score for smoothing", 257 checker_function=lambda x: isinstance(x, float), 258 equate=False), 259 #smoothwindow Integer 7 Window used for anchor 260 # column smoothing. 261 _Option(["-smoothwindow", "smoothwindow"], 262 "Window used for anchor column smoothing", 263 checker_function=lambda x: isinstance(x, int), 264 equate=False), 265 #SUEFF Floating point value 0.1 Constant used in UPGMB 266 # between 0 and 1. clustering. Determines 267 # the relative fraction 268 # of average linkage 269 # (SUEFF) vs. nearest- 270 # neighbor linkage (1 271 # SUEFF). 272 _Option(["-sueff", "sueff"], 273 "Constant used in UPGMB clustering", 274 checker_function=lambda x: isinstance(x, float), 275 equate=False), 276 #tree1 File name None Save tree produced in 277 _Option(["-tree1", "tree1"], 278 "Save Newick tree from iteration 1", 279 equate=False), 280 #tree2 first or second 281 # iteration to given file 282 # in Newick (Phylip- 283 # compatible) format. 284 _Option(["-tree2", "tree2"], 285 "Save Newick tree from iteration 2", 286 equate=False), 287 #weight1 none clustalw Sequence weighting 288 _Option(["-weight1", "weight1"], 289 "Weighting scheme used in iteration 1", 290 checker_function=lambda x: x in WEIGHTING_SCHEMES, 291 equate=False), 292 #weight2 henikoff scheme. 293 # henikoffpb weight1 is used in 294 # gsc iterations 1 and 2. 295 # clustalw weight2 is used for 296 # threeway tree-dependent 297 # refinement. 298 # none=all sequences have 299 # equal weight. 300 # henikoff=Henikoff & 301 # Henikoff weighting 302 # scheme. 303 # henikoffpb=Modified 304 # Henikoff scheme as used 305 # in PSI-BLAST. 306 # clustalw=CLUSTALW 307 # method. 308 # threeway=Gotoh three- 309 # way method. 310 _Option(["-weight2", "weight2"], 311 "Weighting scheme used in iteration 2", 312 checker_function=lambda x: x in WEIGHTING_SCHEMES, 313 equate=False), 314 #################### FORMATS ####################################### 315 # Multiple formats can be specified on the command line 316 # If -msf appears it will be used regardless of other formats 317 # specified. If -clw appears (and not -msf), clustalw format will be 318 # used regardless of other formats specified. If both -clw and 319 # -clwstrict are specified -clwstrict will be used regardless of 320 # other formats specified. If -fasta is specified and not -msf, 321 # -clw, or clwstrict, fasta will be used. If -fasta and -html are 322 # specified -fasta will be used. Only if -html is specified alone 323 # will html be used. I kid ye not. 324 #clw no Write output in CLUSTALW format (default is 325 # FASTA). 326 _Switch(["-clw", "clw"], 327 "Write output in CLUSTALW format (with a MUSCLE header)"), 328 #clwstrict no Write output in CLUSTALW format with the 329 # "CLUSTAL W (1.81)" header rather than the 330 # MUSCLE version. This is useful when a post- 331 # processing step is picky about the file 332 # header. 333 _Switch(["-clwstrict", "clwstrict"], 334 "Write output in CLUSTALW format with version 1.81 header"), 335 #fasta yes Write output in FASTA format. Alternatives 336 # include clw, 337 # clwstrict, msf and html. 338 _Switch(["-fasta", "fasta"], 339 "Write output in FASTA format"), 340 #html no Write output in HTML format (default is 341 # FASTA). 342 _Switch(["-html", "html"], 343 "Write output in HTML format"), 344 #msf no Write output in MSF format (default is 345 # FASTA). 346 _Switch(["-msf", "msf"], 347 "Write output in MSF format"), 348 #Phylip interleaved - undocumented as of 3.7 349 _Switch(["-phyi", "phyi"], 350 "Write output in PHYLIP interleaved format"), 351 #Phylip sequential - undocumented as of 3.7 352 _Switch(["-phys", "phys"], 353 "Write output in PHYLIP sequential format"), 354 ################## Additional specified output files ######### 355 _Option(["-phyiout", "phyiout"], 356 "Write PHYLIP interleaved output to specified filename", 357 filename=True, 358 equate=False), 359 _Option(["-physout", "physout"], "Write PHYLIP sequential format to specified filename", 360 filename=True, 361 equate=False), 362 _Option(["-htmlout", "htmlout"], "Write HTML output to specified filename", 363 filename=True, 364 equate=False), 365 _Option(["-clwout", "clwout"], 366 "Write CLUSTALW output (with MUSCLE header) to specified " 367 "filename", 368 filename=True, 369 equate=False), 370 _Option(["-clwstrictout", "clwstrictout"], 371 "Write CLUSTALW output (with version 1.81 header) to " 372 "specified filename", 373 filename=True, 374 equate=False), 375 _Option(["-msfout", "msfout"], 376 "Write MSF format output to specified filename", 377 filename=True, 378 equate=False), 379 _Option(["-fastaout", "fastaout"], 380 "Write FASTA format output to specified filename", 381 filename=True, 382 equate=False), 383 ############## END FORMATS ################################### 384 #anchors yes Use anchor optimization in tree dependent 385 # refinement iterations. 386 _Switch(["-anchors", "anchors"], 387 "Use anchor optimisation in tree dependent " 388 "refinement iterations"), 389 #noanchors no Disable anchor optimization. Default is 390 # anchors. 391 _Switch(["-noanchors", "noanchors"], 392 "Do not use anchor optimisation in tree dependent " 393 "refinement iterations"), 394 #group yes Group similar sequences together in the 395 # output. This is the default. See also 396 # stable. 397 _Switch(["-group", "group"], 398 "Group similar sequences in output"), 399 #stable no Preserve input order of sequences in output 400 # file. Default is to group sequences by 401 # similarity (group). 402 _Switch(["-stable", "stable"], 403 "Do not group similar sequences in output (not supported in v3.8)"), 404 ############## log-expectation profile score ###################### 405 # One of either -le, -sp, or -sv 406 # 407 # According to the doc, spn is default and the only option for 408 # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can 409 # be used and produce numerically different logs (what is going on?) 410 # 411 #spn fails on proteins 412 #le maybe Use log-expectation profile score (VTML240). 413 # Alternatives are to use sp or sv. This is 414 # the default for amino acid sequences. 415 _Switch(["-le", "le"], 416 "Use log-expectation profile score (VTML240)"), 417 #sv no Use sum-of-pairs profile score (VTML240). 418 # Default is le. 419 _Switch(["-sv", "sv"], 420 "Use sum-of-pairs profile score (VTML240)"), 421 #sp no Use sum-of-pairs protein profile score 422 # (PAM200). Default is le. 423 _Switch(["-sp", "sp"], 424 "Use sum-of-pairs protein profile score (PAM200)"), 425 #spn maybe Use sum-of-pairs nucleotide profile score 426 # (BLASTZ parameters). This is the only option 427 # for nucleotides, and is therefore the 428 # default. 429 _Switch(["-spn", "spn"], 430 "Use sum-of-pairs protein nucleotide profile score"), 431 ############## END log-expectation profile score ###################### 432 #quiet no Do not display progress messages. 433 _Switch(["-quiet", "quiet"], 434 "Use sum-of-pairs protein nucleotide profile score"), 435 #refine no Input file is already aligned, skip first 436 # two iterations and begin tree dependent 437 # refinement. 438 _Switch(["-refine", "refine"], 439 "Only do tree dependent refinement"), 440 #core yes in muscle, Do not catch exceptions. 441 # no in muscled. 442 _Switch(["-core", "core"], 443 "Catch exceptions"), 444 #nocore no in muscle, Catch exceptions and give an error message 445 # yes in muscled. if possible. 446 _Switch(["-nocore", "nocore"], 447 "Do not catch exceptions"), 448 #termgapsfull no Terminal gaps penalized with full penalty. 449 # [1] Not fully supported in this version. 450 # 451 #termgapshalf yes Terminal gaps penalized with half penalty. 452 # [1] Not fully supported in this version. 453 # 454 #termgapshalflonger no Terminal gaps penalized with half penalty if 455 # gap relative to 456 # longer sequence, otherwise with full 457 # penalty. 458 # [1] Not fully supported in this version. 459 #verbose no Write parameter settings and progress 460 # messages to log file. 461 _Switch(["-verbose", "verbose"], 462 "Write parameter settings and progress"), 463 #version no Write version string to stdout and exit. 464 _Switch(["-version", "version"], 465 "Write version string to stdout and exit"), 466 ] 467 AbstractCommandline.__init__(self, cmd, **kwargs)
468 469
470 -def _test():
471 """Run the module's doctests (PRIVATE).""" 472 print("Running MUSCLE doctests...") 473 import doctest 474 doctest.testmod() 475 print("Done")
476 477 if __name__ == "__main__": 478 _test() 479