Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 11   
 12  import os 
 13  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 14   
 15   
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Example:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from StringIO import StringIO  # Python 3: from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    Citations:

    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)

    Last checked against version: MAFFT v6.717b (2009/12/03)
    """

    def __init__(self, cmd="mafft", **kwargs):
        """Declare the MAFFT parameters and initialise the base class.

        cmd is the name or path of the mafft executable (default "mafft").
        Any keyword arguments are passed on to AbstractCommandline.__init__,
        using the parameter names declared in the list below.
        """
        # Note: the accepted values are strings, as checked for --bl below.
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = \
            [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i,
            # FFT-NS-i and FFT-NS-2, according to data size.
            # Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"],
                    "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers.
            # Default: on
            _Switch(["--6merpair", "6merpair", "sixmerpair"],
                    "Distance is calculated based on the number of shared "
                    "6mers. Default: on"),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of globally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (G-INS-i). Default: off (6mer distance is used)
            _Switch(["--globalpair", "globalpair"],
                    "All pairwise alignments are computed with the "
                    "Needleman-Wunsch algorithm. Default: off"),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of locally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (L-INS-i). Default: off (6mer distance is used)
            _Switch(["--localpair", "localpair"],
                    "All pairwise alignments are computed with the "
                    "Smith-Waterman algorithm. Default: off"),
            # All pairwise alignments are computed with a local algorithm
            # with the generalized affine gap cost (Altschul 1998). More
            # accurate but slower than --6merpair. Suitable when large
            # internal gaps are expected. Applicable to up to ~200 sequences.
            # A combination with --maxiterate 1000 is recommended (E-INS-i).
            # Default: off (6mer distance is used)
            _Switch(["--genafpair", "genafpair"],
                    "All pairwise alignments are computed with a local "
                    "algorithm with the generalized affine gap cost "
                    "(Altschul 1998). Default: off"),
            # All pairwise alignments are computed with FASTA (Pearson and
            # Lipman 1988). FASTA is required.
            # Default: off (6mer distance is used)
            _Switch(["--fastapair", "fastapair"],
                    "All pairwise alignments are computed with FASTA "
                    "(Pearson and Lipman 1988). Default: off"),
            # Weighting factor for the consistency term calculated from
            # pairwise alignments. Valid when either of --globalpair,
            # --localpair, --genafpair, --fastapair or --blastpair is
            # selected. Default: 2.7
            _Option(["--weighti", "weighti"],
                    "Weighting factor for the consistency term calculated "
                    "from pairwise alignments. Default: 2.7",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Guide tree is built number times in the progressive stage.
            # Valid with 6mer distance. Default: 2
            _Option(["--retree", "retree"],
                    "Guide tree is built number times in the progressive "
                    "stage. Valid with 6mer distance. Default: 2",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(["--maxiterate", "maxiterate"],
                    "Number cycles of iterative refinement are performed. "
                    "Default: 0",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(["--fft", "fft"],
                    "Use FFT approximation in group-to-group alignment. "
                    "Default: on"),
            # Do not use FFT approximation in group-to-group alignment.
            # Default: off
            _Switch(["--nofft", "nofft"],
                    "Do not use FFT approximation in group-to-group "
                    "alignment. Default: off"),
            # Alignment score is not checked in the iterative refinement
            # stage. Default: off (score is checked)
            _Switch(["--noscore", "noscore"],
                    "Alignment score is not checked in the iterative "
                    "refinement stage. Default: off (score is checked)"),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(["--memsave", "memsave"],
                    "Use the Myers-Miller (1988) algorithm. Default: "
                    "automatically turned on when the alignment length "
                    "exceeds 10,000 (aa/nt)."),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
            # with the 6mer distance. Recommended when a large number
            # (> ~10,000) of sequences are input. Default: off
            _Switch(["--parttree", "parttree"],
                    "Use a fast tree-building method with the 6mer "
                    "distance. Default: off"),
            # The PartTree algorithm is used with distances based on DP.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input.
            # Default: off
            _Switch(["--dpparttree", "dpparttree"],
                    "The PartTree algorithm is used with distances "
                    "based on DP. Default: off"),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input. FASTA
            # is required. Default: off
            _Switch(["--fastaparttree", "fastaparttree"],
                    "The PartTree algorithm is used with distances based "
                    "on FASTA. Default: off"),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(["--partsize", "partsize"],
                    "The number of partitions in the PartTree algorithm. "
                    "Default: 50",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Do not make alignment larger than number sequences. Valid only
            # with the --*parttree options.
            # Default: the number of input sequences
            # This takes a numeric value ("--groupsize number"), so it must
            # be an _Option rather than a bare _Switch.
            _Option(["--groupsize", "groupsize"],
                    "Do not make alignment larger than number sequences. "
                    "Default: the number of input sequences",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(["--adjustdirection", "adjustdirection"],
                    "Adjust direction according to the first sequence. "
                    "Default off."),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(["--adjustdirectionaccurately",
                     "adjustdirectionaccurately"],
                    "Adjust direction according to the first sequence, "
                    "for highly diverged data; very slow. "
                    "Default off."),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(["--op", "op"],
                    "Gap opening penalty at group-to-group alignment. "
                    "Default: 1.53",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(["--ep", "ep"],
                    "Offset value, which works like gap extension penalty, "
                    "for group-to-group alignment. Default: 0.123",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -2.00
            _Option(["--lop", "lop"],
                    "Gap opening penalty at local pairwise alignment. "
                    "Default: -2.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(["--lep", "lep"],
                    "Offset value at local pairwise alignment. "
                    "Default: 0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -0.1
            _Option(["--lexp", "lexp"],
                    "Gap extension penalty at local pairwise alignment. "
                    "Default: -0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(["--LOP", "LOP"],
                    "Gap opening penalty to skip the alignment. "
                    "Default: -6.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(["--LEXP", "LEXP"],
                    "Gap extension penalty to skip the alignment. "
                    "Default: 0.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),

            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(["--bl", "bl"],
                    "BLOSUM number matrix is used. Default: 62",
                    checker_function=lambda x: x in BLOSUM_MATRICES,
                    equate=False),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(["--jtt", "jtt"],
                    "JTT PAM number (Jones et al. 1992) matrix is used. "
                    "number>0. Default: BLOSUM62",
                    equate=False),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            # NOTE(review): filename=True on a numeric option looks odd;
            # kept as in the original - confirm whether it is intentional.
            _Option(["--tm", "tm"],
                    "Transmembrane PAM number (Jones et al. 1994) "
                    "matrix is used. number>0. Default: BLOSUM62",
                    filename=True,
                    equate=False),
            # Use a user-defined AA scoring matrix. The format of matrixfile
            # is the same to that of BLAST. Ignored when nucleotide
            # sequences are input. Default: BLOSUM62
            _Option(["--aamatrix", "aamatrix"],
                    "Use a user-defined AA scoring matrix. "
                    "Default: BLOSUM62",
                    filename=True,
                    equate=False),
            # Incorporate the AA/nuc composition information into the
            # scoring matrix. Default: off
            _Switch(["--fmodel", "fmodel"],
                    "Incorporate the AA/nuc composition information into "
                    "the scoring matrix (True) or not (False, default)"),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(["--namelength", "namelength"],
                    """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(["--clustalout", "clustalout"],
                    "Output format: clustal (True) or fasta (False, default)"),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(["--phylipout", "phylipout"],
                    "Output format: phylip (True), or fasta (False, default)"),
            # Output order: same as input. Default: on
            _Switch(["--inputorder", "inputorder"],
                    "Output order: same as input (True, default) or alignment "
                    "based (False)"),
            # Output order: aligned. Default: off (inputorder)
            _Switch(["--reorder", "reorder"],
                    "Output order: aligned (True) or in input order (False, "
                    "default)"),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(["--treeout", "treeout"],
                    "Guide tree is output to the input.tree file (True) or "
                    "not (False, default)"),
            # Do not report progress. Default: off
            _Switch(["--quiet", "quiet"],
                    "Do not report progress (True) or not (False, default)."),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(["--nuc", "nuc"],
                    "Assume the sequences are nucleotide (True/False). "
                    "Default: auto"),
            # Assume the sequences are amino acid. Default: auto
            _Switch(["--amino", "amino"],
                    "Assume the sequences are amino acid (True/False). "
                    "Default: auto"),
            # ##################### SEEDS ##################################
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in
            # the form: "mafft --seed align1 --seed align2 [etc] input"
            # Only a single --seed parameter is declared here.
            _Option(["--seed", "seed"],
                    "Seed alignments given in alignment_n (fasta format) "
                    "are aligned with sequences in input.",
                    filename=True,
                    equate=False),
            # The old solution of also defining extra parameters with
            # ["--seed", "seed1"] etc worked, but clashes with the recent
            # code in the base class to look for duplicate parameters and
            # raise an error. Perhaps that check should be ignored here, or
            # maybe we can handle this more elegantly...
            # TODO - Create an _OptionList parameter which allows a list to
            # be assigned to the value?
            # ##################### END SEEDS ##############################
            # The input (must be FASTA format)
            _Argument(["input"],
                      "Input file name",
                      filename=True,
                      is_required=True),
            # ###############################################################
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(["input1"],
                      "Second input file name for the mafft-profile command",
                      filename=True),
            ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio import _utils
    _utils.run_doctest()