Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT.""" 
  6   
  7  from __future__ import print_function 
  8   
  9   
 10  import os 
 11  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 12   
 13   
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Example:
    --------

    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from StringIO import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    Citations:
    ----------

    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)

    Last checked against version: MAFFT v6.717b (2009/12/03)
    """

    def __init__(self, cmd="mafft", **kwargs):
        """Declare the supported MAFFT parameters and initialise the wrapper.

        Arguments:
         - cmd - name or path of the executable to run (default "mafft").
         - kwargs - passed unchanged to AbstractCommandline.__init__; the
           class docstring shows these being used to set parameters by
           name, e.g. ``input=in_file``.

        The parameter list is assigned to ``self.parameters`` before the
        base class initialiser is called.
        """
        # The --bl checker compares against these *strings*, so the BLOSUM
        # number has to be supplied as a string (e.g. "62").
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i,
            # FFT-NS-i and FFT-NS-2, according to data size.
            # Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"],
                    "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers.
            # Default: on
            _Switch(["--6merpair", "6merpair", "sixmerpair"],
                    "Distance is calculated based on the number of shared "
                    "6mers. Default: on"),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of globally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (G-INS-i). Default: off (6mer distance is used)
            _Switch(["--globalpair", "globalpair"],
                    "All pairwise alignments are computed with the "
                    "Needleman-Wunsch algorithm. Default: off"),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of locally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (L-INS-i). Default: off (6mer distance is used)
            _Switch(["--localpair", "localpair"],
                    "All pairwise alignments are computed with the "
                    "Smith-Waterman algorithm. Default: off"),
            # All pairwise alignments are computed with a local algorithm
            # with the generalized affine gap cost (Altschul 1998). More
            # accurate but slower than --6merpair. Suitable when large
            # internal gaps are expected. Applicable to up to ~200 sequences.
            # A combination with --maxiterate 1000 is recommended (E-INS-i).
            # Default: off (6mer distance is used)
            _Switch(["--genafpair", "genafpair"],
                    "All pairwise alignments are computed with a local "
                    "algorithm with the generalized affine gap cost "
                    "(Altschul 1998). Default: off"),
            # All pairwise alignments are computed with FASTA (Pearson and
            # Lipman 1988). FASTA is required.
            # Default: off (6mer distance is used)
            _Switch(["--fastapair", "fastapair"],
                    "All pairwise alignments are computed with FASTA "
                    "(Pearson and Lipman 1988). Default: off"),
            # Weighting factor for the consistency term calculated from
            # pairwise alignments. Valid when either of --globalpair,
            # --localpair, --genafpair, --fastapair or --blastpair is
            # selected. Default: 2.7
            _Option(["--weighti", "weighti"],
                    "Weighting factor for the consistency term calculated "
                    "from pairwise alignments. Default: 2.7",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Guide tree is built number times in the progressive stage.
            # Valid with 6mer distance. Default: 2
            _Option(["--retree", "retree"],
                    "Guide tree is built number times in the progressive "
                    "stage. Valid with 6mer distance. Default: 2",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Number cycles of iterative refinement are performed.
            # Default: 0
            _Option(["--maxiterate", "maxiterate"],
                    "Number cycles of iterative refinement are performed. "
                    "Default: 0",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Number of threads to use. Default: 1
            _Option(["--thread", "thread"],
                    "Number of threads to use. Default: 1",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(["--fft", "fft"],
                    "Use FFT approximation in group-to-group alignment. "
                    "Default: on"),
            # Do not use FFT approximation in group-to-group alignment.
            # Default: off
            _Switch(["--nofft", "nofft"],
                    "Do not use FFT approximation in group-to-group "
                    "alignment. Default: off"),
            # Alignment score is not checked in the iterative refinement
            # stage. Default: off (score is checked)
            _Switch(["--noscore", "noscore"],
                    "Alignment score is not checked in the iterative "
                    "refinement stage. Default: off (score is checked)"),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(["--memsave", "memsave"],
                    "Use the Myers-Miller (1988) algorithm. Default: "
                    "automatically turned on when the alignment length "
                    "exceeds 10,000 (aa/nt)."),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
            # with the 6mer distance. Recommended for a large number
            # (> ~10,000) of sequences are input. Default: off
            _Switch(["--parttree", "parttree"],
                    "Use a fast tree-building method with the 6mer "
                    "distance. Default: off"),
            # The PartTree algorithm is used with distances based on DP.
            # Slightly more accurate and slower than --parttree. Recommended
            # for a large number (> ~10,000) of sequences are input.
            # Default: off
            _Switch(["--dpparttree", "dpparttree"],
                    "The PartTree algorithm is used with distances "
                    "based on DP. Default: off"),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended
            # for a large number (> ~10,000) of sequences are input. FASTA
            # is required. Default: off
            _Switch(["--fastaparttree", "fastaparttree"],
                    "The PartTree algorithm is used with distances based "
                    "on FASTA. Default: off"),
            # The number of partitions in the PartTree algorithm.
            # Default: 50
            _Option(["--partsize", "partsize"],
                    "The number of partitions in the PartTree algorithm. "
                    "Default: 50",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Do not make alignment larger than number sequences. Valid only
            # with the --*parttree options.
            # Default: the number of input sequences
            # NOTE(review): the description talks about a number, yet this is
            # declared as a value-less _Switch; left unchanged to keep the
            # existing interface - confirm against the MAFFT manual.
            _Switch(["--groupsize", "groupsize"],
                    "Do not make alignment larger than number sequences. "
                    "Default: the number of input sequences"),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(["--adjustdirection", "adjustdirection"],
                    "Adjust direction according to the first sequence. "
                    "Default off."),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            # (Help text fixed: the adjacent literals used to be joined
            # without any separating spaces.)
            _Switch(["--adjustdirectionaccurately",
                     "adjustdirectionaccurately"],
                    "Adjust direction according to the first sequence, "
                    "for highly diverged data; very slow. "
                    "Default off."),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(["--op", "op"],
                    "Gap opening penalty at group-to-group alignment. "
                    "Default: 1.53",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(["--ep", "ep"],
                    "Offset value, which works like gap extension penalty, "
                    "for group-to-group alignment. Default: 0.123",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -2.00
            # (Help text fixed: it used to repeat the --ep default, 0.123.)
            _Option(["--lop", "lop"],
                    "Gap opening penalty at local pairwise alignment. "
                    "Default: -2.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(["--lep", "lep"],
                    "Offset value at local pairwise alignment. "
                    "Default: 0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -0.1
            _Option(["--lexp", "lexp"],
                    "Gap extension penalty at local pairwise alignment. "
                    "Default: -0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(["--LOP", "LOP"],
                    "Gap opening penalty to skip the alignment. "
                    "Default: -6.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(["--LEXP", "LEXP"],
                    "Gap extension penalty to skip the alignment. "
                    "Default: 0.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),

            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(["--bl", "bl"],
                    "BLOSUM number matrix is used. Default: 62",
                    checker_function=lambda x: x in BLOSUM_MATRICES,
                    equate=False),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(["--jtt", "jtt"],
                    "JTT PAM number (Jones et al. 1992) matrix is used. "
                    "number>0. Default: BLOSUM62",
                    equate=False),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            # NOTE(review): flagged filename=True although the value is
            # described as a PAM number (--jtt above is not); kept as-is,
            # confirm whether this is intentional.
            _Option(["--tm", "tm"],
                    "Transmembrane PAM number (Jones et al. 1994) "
                    "matrix is used. number>0. Default: BLOSUM62",
                    filename=True,
                    equate=False),
            # Use a user-defined AA scoring matrix. The format of matrixfile
            # is the same to that of BLAST. Ignored when nucleotide
            # sequences are input. Default: BLOSUM62
            _Option(["--aamatrix", "aamatrix"],
                    "Use a user-defined AA scoring matrix. "
                    "Default: BLOSUM62",
                    filename=True,
                    equate=False),
            # Incorporate the AA/nuc composition information into the
            # scoring matrix. Default: off
            _Switch(["--fmodel", "fmodel"],
                    "Incorporate the AA/nuc composition information into "
                    "the scoring matrix (True) or not (False, default)"),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(["--namelength", "namelength"],
                    """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(["--clustalout", "clustalout"],
                    "Output format: clustal (True) or fasta (False, default)"),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(["--phylipout", "phylipout"],
                    "Output format: phylip (True), or fasta (False, default)"),
            # Output order: same as input. Default: on
            _Switch(["--inputorder", "inputorder"],
                    "Output order: same as input (True, default) or alignment "
                    "based (False)"),
            # Output order: aligned. Default: off (inputorder)
            _Switch(["--reorder", "reorder"],
                    "Output order: aligned (True) or in input order (False, "
                    "default)"),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(["--treeout", "treeout"],
                    "Guide tree is output to the input.tree file (True) or "
                    "not (False, default)"),
            # Do not report progress. Default: off
            _Switch(["--quiet", "quiet"],
                    "Do not report progress (True) or not (False, default)."),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(["--nuc", "nuc"],
                    "Assume the sequences are nucleotide (True/False). "
                    "Default: auto"),
            # Assume the sequences are amino acid. Default: auto
            _Switch(["--amino", "amino"],
                    "Assume the sequences are amino acid (True/False). "
                    "Default: auto"),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in
            # the form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(["--seed", "seed"],
                    "Seed alignments given in alignment_n (fasta format) "
                    "are aligned with sequences in input.",
                    filename=True,
                    equate=False),
            # The input (must be FASTA format)
            _Argument(["input"],
                      "Input file name",
                      filename=True,
                      is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(["input1"],
                      "Second input file name for the mafft-profile command",
                      filename=True),
        ]
        # self.parameters is populated first; the base initialiser is then
        # given the executable name and any caller-supplied keyword values.
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # Script entry point: hand over to Biopython's doctest runner helper.
    from Bio import _utils
    _utils.run_doctest()