Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10   
 11  import os 
 12  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 13   
 14   
15 -class MafftCommandline(AbstractCommandline):
16 """Command line wrapper for the multiple alignment program MAFFT. 17 18 http://align.bmr.kyushu-u.ac.jp/mafft/software/ 19 20 Example: 21 -------- 22 23 >>> from Bio.Align.Applications import MafftCommandline 24 >>> mafft_exe = "/opt/local/mafft" 25 >>> in_file = "../Doc/examples/opuntia.fasta" 26 >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file) 27 >>> print(mafft_cline) 28 /opt/local/mafft ../Doc/examples/opuntia.fasta 29 30 If the mafft binary is on the path (typically the case on a Unix style 31 operating system) then you don't need to supply the executable location: 32 33 >>> from Bio.Align.Applications import MafftCommandline 34 >>> in_file = "../Doc/examples/opuntia.fasta" 35 >>> mafft_cline = MafftCommandline(input=in_file) 36 >>> print(mafft_cline) 37 mafft ../Doc/examples/opuntia.fasta 38 39 You would typically run the command line with mafft_cline() or via 40 the Python subprocess module, as described in the Biopython tutorial. 41 Note that MAFFT will write the alignment to stdout, which you may 42 want to save to a file and then parse, e.g.:: 43 44 stdout, stderr = mafft_cline() 45 with open("aligned.fasta", "w") as handle: 46 handle.write(stdout) 47 from Bio import AlignIO 48 align = AlignIO.read("aligned.fasta", "fasta") 49 50 Alternatively, to parse the output with AlignIO directly you can 51 use StringIO to turn the string into a handle:: 52 53 stdout, stderr = mafft_cline() 54 from StringIO import StringIO 55 from Bio import AlignIO 56 align = AlignIO.read(StringIO(stdout), "fasta") 57 58 Citations: 59 ---------- 60 61 Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of 62 multiple ncRNA alignment by incorporating structural information into 63 a MAFFT-based framework (describes RNA structural alignment methods) 64 65 Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent 66 developments in the MAFFT multiple sequence alignment program 67 (outlines version 6) 68 69 Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an 70 algorithm to build an approximate tree from a large number of 71 unaligned sequences (describes the PartTree algorithm) 72 73 Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT 74 version 5: improvement in accuracy of multiple sequence alignment 75 (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i 76 strategies) 77 78 Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002) 79 80 Last checked against version: MAFFT v6.717b (2009/12/03) 81 """
82 - def __init__(self, cmd="mafft", **kwargs):
83 BLOSUM_MATRICES = ["30", "45", "62", "80"] 84 self.parameters = \ 85 [ 86 # **** Algorithm **** 87 # Automatically selects an appropriate strategy from L-INS-i, FFT-NS- 88 # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) 89 _Switch(["--auto", "auto"], 90 "Automatically select strategy. Default off."), 91 # Distance is calculated based on the number of shared 6mers. Default: on 92 _Switch(["--6merpair", "6merpair", "sixmerpair"], 93 "Distance is calculated based on the number of shared " 94 "6mers. Default: on"), 95 # All pairwise alignments are computed with the Needleman-Wunsch 96 # algorithm. More accurate but slower than --6merpair. Suitable for a 97 # set of globally alignable sequences. Applicable to up to ~200 98 # sequences. A combination with --maxiterate 1000 is recommended (G- 99 # INS-i). Default: off (6mer distance is used) 100 _Switch(["--globalpair", "globalpair"], 101 "All pairwise alignments are computed with the " 102 "Needleman-Wunsch algorithm. Default: off"), 103 # All pairwise alignments are computed with the Smith-Waterman 104 # algorithm. More accurate but slower than --6merpair. Suitable for a 105 # set of locally alignable sequences. Applicable to up to ~200 106 # sequences. A combination with --maxiterate 1000 is recommended (L- 107 # INS-i). Default: off (6mer distance is used) 108 _Switch(["--localpair", "localpair"], 109 "All pairwise alignments are computed with the " 110 "Smith-Waterman algorithm. Default: off"), 111 # All pairwise alignments are computed with a local algorithm with 112 # the generalized affine gap cost (Altschul 1998). More accurate but 113 # slower than --6merpair. Suitable when large internal gaps are 114 # expected. Applicable to up to ~200 sequences. A combination with -- 115 # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer 116 # distance is used) 117 _Switch(["--genafpair", "genafpair"], 118 "All pairwise alignments are computed with a local " 119 "algorithm with the generalized affine gap cost " 120 "(Altschul 1998). Default: off"), 121 # All pairwise alignments are computed with FASTA (Pearson and Lipman 122 # 1988). FASTA is required. Default: off (6mer distance is used) 123 _Switch(["--fastapair", "fastapair"], 124 "All pairwise alignments are computed with FASTA " 125 "(Pearson and Lipman 1988). Default: off"), 126 # Weighting factor for the consistency term calculated from pairwise 127 # alignments. Valid when either of --blobalpair, --localpair, -- 128 # genafpair, --fastapair or --blastpair is selected. Default: 2.7 129 _Option(["--weighti", "weighti"], 130 "Weighting factor for the consistency term calculated " 131 "from pairwise alignments. Default: 2.7", 132 checker_function=lambda x: isinstance(x, float), 133 equate=False), 134 # Guide tree is built number times in the progressive stage. Valid 135 # with 6mer distance. Default: 2 136 _Option(["--retree", "retree"], 137 "Guide tree is built number times in the progressive " 138 "stage. Valid with 6mer distance. Default: 2", 139 checker_function=lambda x: isinstance(x, int), 140 equate=False), 141 # Number cycles of iterative refinement are performed. Default: 0 142 _Option(["--maxiterate", "maxiterate"], 143 "Number cycles of iterative refinement are performed. " 144 "Default: 0", 145 checker_function=lambda x: isinstance(x, int), 146 equate=False), 147 # Use FFT approximation in group-to-group alignment. Default: on 148 _Switch(["--fft", "fft"], 149 "Use FFT approximation in group-to-group alignment. " 150 "Default: on"), 151 # Do not use FFT approximation in group-to-group alignment. Default: 152 # off 153 _Switch(["--nofft", "nofft"], 154 "Do not use FFT approximation in group-to-group " 155 "alignment. Default: off"), 156 # Alignment score is not checked in the iterative refinement stage. 157 # Default: off (score is checked) 158 _Switch(["--noscore", "noscore"], 159 "Alignment score is not checked in the iterative " 160 "refinement stage. Default: off (score is checked)"), 161 # Use the Myers-Miller (1988) algorithm. Default: automatically 162 # turned on when the alignment length exceeds 10,000 (aa/nt). 163 _Switch(["--memsave", "memsave"], 164 "Use the Myers-Miller (1988) algorithm. Default: " 165 "automatically turned on when the alignment length " 166 "exceeds 10,000 (aa/nt)."), 167 # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with 168 # the 6mer distance. Recommended for a large number (> ~10,000) of 169 # sequences are input. Default: off 170 _Switch(["--parttree", "parttree"], 171 "Use a fast tree-building method with the 6mer " 172 "distance. Default: off"), 173 # The PartTree algorithm is used with distances based on DP. Slightly 174 # more accurate and slower than --parttree. Recommended for a large 175 # number (> ~10,000) of sequences are input. Default: off 176 _Switch(["--dpparttree", "dpparttree"], 177 "The PartTree algorithm is used with distances " 178 "based on DP. Default: off"), 179 # The PartTree algorithm is used with distances based on FASTA. 180 # Slightly more accurate and slower than --parttree. Recommended for 181 # a large number (> ~10,000) of sequences are input. FASTA is 182 # required. Default: off 183 _Switch(["--fastaparttree", "fastaparttree"], 184 "The PartTree algorithm is used with distances based " 185 "on FASTA. Default: off"), 186 # The number of partitions in the PartTree algorithm. Default: 50 187 _Option(["--partsize", "partsize"], 188 "The number of partitions in the PartTree algorithm. " 189 "Default: 50", 190 checker_function=lambda x: isinstance(x, int), 191 equate=False), 192 # Do not make alignment larger than number sequences. Valid only with 193 # the --*parttree options. Default: the number of input sequences 194 _Switch(["--groupsize", "groupsize"], 195 "Do not make alignment larger than number sequences. " 196 "Default: the number of input sequences"), 197 # Adjust direction according to the first sequence 198 # Mafft V6 beta function 199 _Switch(["--adjustdirection", "adjustdirection"], 200 "Adjust direction according to the first sequence. " 201 "Default off."), 202 # Adjust direction according to the first sequence 203 # for highly diverged data; very slow 204 # Mafft V6 beta function 205 _Switch(["--adjustdirectionaccurately", "adjustdirectionaccurately"], 206 "Adjust direction according to the first sequence," 207 "for highly diverged data; very slow" 208 "Default off."), 209 # **** Parameter **** 210 # Gap opening penalty at group-to-group alignment. Default: 1.53 211 _Option(["--op", "op"], 212 "Gap opening penalty at group-to-group alignment. " 213 "Default: 1.53", 214 checker_function=lambda x: isinstance(x, float), 215 equate=False), 216 # Offset value, which works like gap extension penalty, for group-to- 217 # group alignment. Deafult: 0.123 218 _Option(["--ep", "ep"], 219 "Offset value, which works like gap extension penalty, " 220 "for group-to- group alignment. Default: 0.123", 221 checker_function=lambda x: isinstance(x, float), 222 equate=False), 223 # Gap opening penalty at local pairwise alignment. Valid when the -- 224 # localpair or --genafpair option is selected. Default: -2.00 225 _Option(["--lop", "lop"], 226 "Gap opening penalty at local pairwise alignment. " 227 "Default: 0.123", 228 checker_function=lambda x: isinstance(x, float), 229 equate=False), 230 # Offset value at local pairwise alignment. Valid when the -- 231 # localpair or --genafpair option is selected. Default: 0.1 232 _Option(["--lep", "lep"], 233 "Offset value at local pairwise alignment. " 234 "Default: 0.1", 235 checker_function=lambda x: isinstance(x, float), 236 equate=False), 237 # Gap extension penalty at local pairwise alignment. Valid when the - 238 # -localpair or --genafpair option is selected. Default: -0.1 239 _Option(["--lexp", "lexp"], 240 "Gap extension penalty at local pairwise alignment. " 241 "Default: -0.1", 242 checker_function=lambda x: isinstance(x, float), 243 equate=False), 244 # Gap opening penalty to skip the alignment. Valid when the -- 245 # genafpair option is selected. Default: -6.00 246 _Option(["--LOP", "LOP"], 247 "Gap opening penalty to skip the alignment. " 248 "Default: -6.00", 249 checker_function=lambda x: isinstance(x, float), 250 equate=False), 251 # Gap extension penalty to skip the alignment. Valid when the -- 252 # genafpair option is selected. Default: 0.00 253 _Option(["--LEXP", "LEXP"], 254 "Gap extension penalty to skip the alignment. " 255 "Default: 0.00", 256 checker_function=lambda x: isinstance(x, float), 257 equate=False), 258 259 # BLOSUM number matrix (Henikoff and Henikoff 1992) is used. 260 # number=30, 45, 62 or 80. Default: 62 261 _Option(["--bl", "bl"], 262 "BLOSUM number matrix is used. Default: 62", 263 checker_function=lambda x: x in BLOSUM_MATRICES, 264 equate=False), 265 # JTT PAM number (Jones et al. 1992) matrix is used. number>0. 266 # Default: BLOSUM62 267 _Option(["--jtt", "jtt"], 268 "JTT PAM number (Jones et al. 1992) matrix is used. " 269 "number>0. Default: BLOSUM62", 270 equate=False), 271 # Transmembrane PAM number (Jones et al. 1994) matrix is used. 272 # number>0. Default: BLOSUM62 273 _Option(["--tm", "tm"], 274 "Transmembrane PAM number (Jones et al. 1994) " 275 "matrix is used. number>0. Default: BLOSUM62", 276 filename=True, 277 equate=False), 278 # Use a user-defined AA scoring matrix. The format of matrixfile is 279 # the same to that of BLAST. Ignored when nucleotide sequences are 280 # input. Default: BLOSUM62 281 _Option(["--aamatrix", "aamatrix"], 282 "Use a user-defined AA scoring matrix. " 283 "Default: BLOSUM62", 284 filename=True, 285 equate=False), 286 # Incorporate the AA/nuc composition information into the scoring 287 # matrix. Default: off 288 _Switch(["--fmodel", "fmodel"], 289 "Incorporate the AA/nuc composition information into " 290 "the scoring matrix (True) or not (False, default)"), 291 # **** Output **** 292 # Name length for CLUSTAL and PHYLIP format output 293 _Option(["--namelength", "namelength"], 294 """Name length in CLUSTAL and PHYLIP output. 295 296 MAFFT v6.847 (2011) added --namelength for use with 297 the --clustalout option for CLUSTAL output. 298 299 MAFFT v7.024 (2013) added support for this with the 300 --phylipout option for PHYLIP output (default 10). 301 """, 302 checker_function=lambda x: isinstance(x, int), 303 equate=False), 304 # Output format: clustal format. Default: off (fasta format) 305 _Switch(["--clustalout", "clustalout"], 306 "Output format: clustal (True) or fasta (False, default)"), 307 # Output format: phylip format. 308 # Added in beta with v6.847, fixed in v6.850 (2011) 309 _Switch(["--phylipout", "phylipout"], 310 "Output format: phylip (True), or fasta (False, default)"), 311 # Output order: same as input. Default: on 312 _Switch(["--inputorder", "inputorder"], 313 "Output order: same as input (True, default) or alignment " 314 "based (False)"), 315 # Output order: aligned. Default: off (inputorder) 316 _Switch(["--reorder", "reorder"], 317 "Output order: aligned (True) or in input order (False, " 318 "default)"), 319 # Guide tree is output to the input.tree file. Default: off 320 _Switch(["--treeout", "treeout"], 321 "Guide tree is output to the input.tree file (True) or " 322 "not (False, default)"), 323 # Do not report progress. Default: off 324 _Switch(["--quiet", "quiet"], 325 "Do not report progress (True) or not (False, default)."), 326 # **** Input **** 327 # Assume the sequences are nucleotide. Deafult: auto 328 _Switch(["--nuc", "nuc"], 329 "Assume the sequences are nucleotide (True/False). " 330 "Default: auto"), 331 # Assume the sequences are amino acid. Deafult: auto 332 _Switch(["--amino", "amino"], 333 "Assume the sequences are amino acid (True/False). " 334 "Default: auto"), 335 # MAFFT has multiple --seed commands where the unaligned input is 336 # aligned to the seed alignment. There can be multiple seeds in the 337 # form: "mafft --seed align1 --seed align2 [etc] input" 338 # Effectively for n number of seed alignments. 339 # TODO - Can we use class _ArgumentList here? 340 _Option(["--seed", "seed"], 341 "Seed alignments given in alignment_n (fasta format) " 342 "are aligned with sequences in input.", 343 filename=True, 344 equate=False), 345 # The input (must be FASTA format) 346 _Argument(["input"], 347 "Input file name", 348 filename=True, 349 is_required=True), 350 # mafft-profile takes a second alignment input as an argument: 351 # mafft-profile align1 align2 352 _Argument(["input1"], 353 "Second input file name for the mafft-profile command", 354 filename=True), 355 ] 356 AbstractCommandline.__init__(self, cmd, **kwargs)
357 358 359 if __name__ == "__main__": 360 from Bio._utils import run_doctest 361 run_doctest() 362