Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6  """ 
  7   
  8  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
  9   
 10  import os 
 11  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 12   
 13   
14 -class MafftCommandline(AbstractCommandline):
15 """Command line wrapper for the multiple alignment program MAFFT. 16 17 http://align.bmr.kyushu-u.ac.jp/mafft/software/ 18 19 Example: 20 21 >>> from Bio.Align.Applications import MafftCommandline 22 >>> mafft_exe = "/opt/local/mafft" 23 >>> in_file = "../Doc/examples/opuntia.fasta" 24 >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file) 25 >>> print mafft_cline 26 /opt/local/mafft ../Doc/examples/opuntia.fasta 27 28 If the mafft binary is on the path (typically the case on a Unix style 29 operating system) then you don't need to supply the executable location: 30 31 >>> from Bio.Align.Applications import MafftCommandline 32 >>> in_file = "../Doc/examples/opuntia.fasta" 33 >>> mafft_cline = MafftCommandline(input=in_file) 34 >>> print mafft_cline 35 mafft ../Doc/examples/opuntia.fasta 36 37 You would typically run the command line with mafft_cline() or via 38 the Python subprocess module, as described in the Biopython tutorial. 39 Note that MAFFT will write the alignment to stdout, which you may 40 want to save to a file and then parse, e.g. 41 42 stdout, stderr = mafft_cline() 43 handle = open("aligned.fasta", "w") 44 handle.write(stdout) 45 handle.close() 46 from Bio import AlignIO 47 align = AlignIO.read("aligned.fasta", "fasta") 48 49 Alternatively, to parse the output with AlignIO directly you can 50 use StringIO to turn the string into a handle: 51 52 stdout, stderr = mafft_cline() 53 from StringIO import StringIO 54 from Bio import AlignIO 55 align = AlignIO.read(StringIO(stdout), "fasta") 56 57 Citations: 58 59 Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of 60 multiple ncRNA alignment by incorporating structural information into 61 a MAFFT-based framework (describes RNA structural alignment methods) 62 63 Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent 64 developments in the MAFFT multiple sequence alignment program 65 (outlines version 6) 66 67 Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an 68 algorithm to build an approximate tree from a large number of 69 unaligned sequences (describes the PartTree algorithm) 70 71 Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT 72 version 5: improvement in accuracy of multiple sequence alignment 73 (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i 74 strategies) 75 76 Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002) 77 78 Last checked against version: MAFFT v6.717b (2009/12/03) 79 """
80 - def __init__(self, cmd="mafft", **kwargs):
81 BLOSUM_MATRICES = ["30","45","62","80"] 82 self.parameters = \ 83 [ 84 #**** Algorithm **** 85 #Automatically selects an appropriate strategy from L-INS-i, FFT-NS- 86 #i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) 87 _Switch(["--auto", "auto"], 88 "Automatically select strategy. Default off."), 89 #Distance is calculated based on the number of shared 6mers. Default: on 90 _Switch(["--6merpair", "6merpair", "sixmerpair"], 91 "Distance is calculated based on the number of shared " 92 "6mers. Default: on"), 93 #All pairwise alignments are computed with the Needleman-Wunsch 94 #algorithm. More accurate but slower than --6merpair. Suitable for a 95 #set of globally alignable sequences. Applicable to up to ~200 96 #sequences. A combination with --maxiterate 1000 is recommended (G- 97 #INS-i). Default: off (6mer distance is used) 98 _Switch(["--globalpair", "globalpair"], 99 "All pairwise alignments are computed with the " 100 "Needleman-Wunsch algorithm. Default: off"), 101 #All pairwise alignments are computed with the Smith-Waterman 102 #algorithm. More accurate but slower than --6merpair. Suitable for a 103 #set of locally alignable sequences. Applicable to up to ~200 104 #sequences. A combination with --maxiterate 1000 is recommended (L- 105 #INS-i). Default: off (6mer distance is used) 106 _Switch(["--localpair", "localpair"], 107 "All pairwise alignments are computed with the " 108 "Smith-Waterman algorithm. Default: off"), 109 #All pairwise alignments are computed with a local algorithm with 110 #the generalized affine gap cost (Altschul 1998). More accurate but 111 #slower than --6merpair. Suitable when large internal gaps are 112 #expected. Applicable to up to ~200 sequences. A combination with -- 113 #maxiterate 1000 is recommended (E-INS-i). Default: off (6mer 114 #distance is used) 115 _Switch(["--genafpair", "genafpair"], 116 "All pairwise alignments are computed with a local " 117 "algorithm with the generalized affine gap cost " 118 "(Altschul 1998). Default: off"), 119 #All pairwise alignments are computed with FASTA (Pearson and Lipman 120 #1988). FASTA is required. Default: off (6mer distance is used) 121 _Switch(["--fastapair", "fastapair"], 122 "All pairwise alignments are computed with FASTA " 123 "(Pearson and Lipman 1988). Default: off"), 124 #Weighting factor for the consistency term calculated from pairwise 125 #alignments. Valid when either of --blobalpair, --localpair, -- 126 #genafpair, --fastapair or --blastpair is selected. Default: 2.7 127 _Option(["--weighti", "weighti"], 128 "Weighting factor for the consistency term calculated " 129 "from pairwise alignments. Default: 2.7", 130 checker_function=lambda x: isinstance(x, float), 131 equate=False), 132 #Guide tree is built number times in the progressive stage. Valid 133 #with 6mer distance. Default: 2 134 _Option(["--retree", "retree"], 135 "Guide tree is built number times in the progressive " 136 "stage. Valid with 6mer distance. Default: 2", 137 checker_function=lambda x: isinstance(x, int), 138 equate=False), 139 #Number cycles of iterative refinement are performed. Default: 0 140 _Option(["--maxiterate", "maxiterate"], 141 "Number cycles of iterative refinement are performed. " 142 "Default: 0", 143 checker_function=lambda x: isinstance(x, int), 144 equate=False), 145 #Use FFT approximation in group-to-group alignment. Default: on 146 _Switch(["--fft", "fft"], 147 "Use FFT approximation in group-to-group alignment. " 148 "Default: on"), 149 #Do not use FFT approximation in group-to-group alignment. Default: 150 #off 151 _Switch(["--nofft", "nofft"], 152 "Do not use FFT approximation in group-to-group " 153 "alignment. Default: off"), 154 #Alignment score is not checked in the iterative refinement stage. 155 #Default: off (score is checked) 156 _Switch(["--noscore", "noscore"], 157 "Alignment score is not checked in the iterative " 158 "refinement stage. Default: off (score is checked)"), 159 #Use the Myers-Miller (1988) algorithm. Default: automatically 160 #turned on when the alignment length exceeds 10,000 (aa/nt). 161 _Switch(["--memsave", "memsave"], 162 "Use the Myers-Miller (1988) algorithm. Default: " 163 "automatically turned on when the alignment length " 164 "exceeds 10,000 (aa/nt)."), 165 #Use a fast tree-building method (PartTree, Katoh and Toh 2007) with 166 #the 6mer distance. Recommended for a large number (> ~10,000) of 167 #sequences are input. Default: off 168 _Switch(["--parttree", "parttree"], 169 "Use a fast tree-building method with the 6mer " 170 "distance. Default: off"), 171 #The PartTree algorithm is used with distances based on DP. Slightly 172 #more accurate and slower than --parttree. Recommended for a large 173 #number (> ~10,000) of sequences are input. Default: off 174 _Switch(["--dpparttree", "dpparttree"], 175 "The PartTree algorithm is used with distances " 176 "based on DP. Default: off"), 177 #The PartTree algorithm is used with distances based on FASTA. 178 #Slightly more accurate and slower than --parttree. Recommended for 179 #a large number (> ~10,000) of sequences are input. FASTA is 180 #required. Default: off 181 _Switch(["--fastaparttree", "fastaparttree"], 182 "The PartTree algorithm is used with distances based " 183 "on FASTA. Default: off"), 184 #The number of partitions in the PartTree algorithm. Default: 50 185 _Option(["--partsize", "partsize"], 186 "The number of partitions in the PartTree algorithm. " 187 "Default: 50", 188 checker_function=lambda x: isinstance(x, int), 189 equate=False), 190 #Do not make alignment larger than number sequences. Valid only with 191 #the --*parttree options. Default: the number of input sequences 192 _Switch(["--groupsize", "groupsize"], 193 "Do not make alignment larger than number sequences. " 194 "Default: the number of input sequences"), 195 #Adjust direction according to the first sequence 196 #Mafft V6 beta function 197 _Switch(["--adjustdirection", "adjustdirection"], 198 "Adjust direction according to the first sequence. " 199 "Default off."), 200 #Adjust direction according to the first sequence 201 #for highly diverged data; very slow 202 #Mafft V6 beta function 203 _Switch(["--adjustdirectionaccurately", "adjustdirectionaccurately"], 204 "Adjust direction according to the first sequence," 205 "for highly diverged data; very slow" 206 "Default off."), 207 #**** Parameter **** 208 #Gap opening penalty at group-to-group alignment. Default: 1.53 209 _Option(["--op", "op"], 210 "Gap opening penalty at group-to-group alignment. " 211 "Default: 1.53", 212 checker_function=lambda x: isinstance(x, float), 213 equate=False), 214 #Offset value, which works like gap extension penalty, for group-to- 215 #group alignment. Deafult: 0.123 216 _Option(["--ep", "ep"], 217 "Offset value, which works like gap extension penalty, " 218 "for group-to- group alignment. Default: 0.123", 219 checker_function=lambda x: isinstance(x, float), 220 equate=False), 221 #Gap opening penalty at local pairwise alignment. Valid when the -- 222 #localpair or --genafpair option is selected. Default: -2.00 223 _Option(["--lop", "lop"], 224 "Gap opening penalty at local pairwise alignment. " 225 "Default: 0.123", 226 checker_function=lambda x: isinstance(x, float), 227 equate=False), 228 #Offset value at local pairwise alignment. Valid when the -- 229 #localpair or --genafpair option is selected. Default: 0.1 230 _Option(["--lep", "lep"], 231 "Offset value at local pairwise alignment. " 232 "Default: 0.1", 233 checker_function=lambda x: isinstance(x, float), 234 equate=False), 235 #Gap extension penalty at local pairwise alignment. Valid when the - 236 #-localpair or --genafpair option is selected. Default: -0.1 237 _Option(["--lexp", "lexp"], 238 "Gap extension penalty at local pairwise alignment. " 239 "Default: -0.1", 240 checker_function=lambda x: isinstance(x, float), 241 equate=False), 242 #Gap opening penalty to skip the alignment. Valid when the -- 243 #genafpair option is selected. Default: -6.00 244 _Option(["--LOP", "LOP"], 245 "Gap opening penalty to skip the alignment. " 246 "Default: -6.00", 247 checker_function=lambda x: isinstance(x, float), 248 equate=False), 249 #Gap extension penalty to skip the alignment. Valid when the -- 250 #genafpair option is selected. Default: 0.00 251 _Option(["--LEXP", "LEXP"], 252 "Gap extension penalty to skip the alignment. " 253 "Default: 0.00", 254 checker_function=lambda x: isinstance(x, float), 255 equate=False), 256 257 #BLOSUM number matrix (Henikoff and Henikoff 1992) is used. 258 #number=30, 45, 62 or 80. Default: 62 259 _Option(["--bl", "bl"], 260 "BLOSUM number matrix is used. Default: 62", 261 checker_function=lambda x: x in BLOSUM_MATRICES, 262 equate=False), 263 #JTT PAM number (Jones et al. 1992) matrix is used. number>0. 264 #Default: BLOSUM62 265 _Option(["--jtt", "jtt"], 266 "JTT PAM number (Jones et al. 1992) matrix is used. " 267 "number>0. Default: BLOSUM62", 268 equate=False), 269 #Transmembrane PAM number (Jones et al. 1994) matrix is used. 270 #number>0. Default: BLOSUM62 271 _Option(["--tm", "tm"], 272 "Transmembrane PAM number (Jones et al. 1994) " 273 "matrix is used. number>0. Default: BLOSUM62", 274 filename=True, 275 equate=False), 276 #Use a user-defined AA scoring matrix. The format of matrixfile is 277 #the same to that of BLAST. Ignored when nucleotide sequences are 278 #input. Default: BLOSUM62 279 _Option(["--aamatrix", "aamatrix"], 280 "Use a user-defined AA scoring matrix. " 281 "Default: BLOSUM62", 282 filename=True, 283 equate=False), 284 #Incorporate the AA/nuc composition information into the scoring 285 #matrix. Default: off 286 _Switch(["--fmodel", "fmodel"], 287 "Incorporate the AA/nuc composition information into " 288 "the scoring matrix (True) or not (False, default)"), 289 #**** Output **** 290 #Output format: clustal format. Default: off (fasta format) 291 _Switch(["--clustalout", "clustalout"], 292 "Output format: clustal (True) or fasta (False, default)"), 293 #Output order: same as input. Default: on 294 _Switch(["--inputorder", "inputorder"], 295 "Output order: same as input (True, default) or alignment " 296 "based (False)"), 297 #Output order: aligned. Default: off (inputorder) 298 _Switch(["--reorder", "reorder"], 299 "Output order: aligned (True) or in input order (False, " 300 "default)"), 301 #Guide tree is output to the input.tree file. Default: off 302 _Switch(["--treeout", "treeout"], 303 "Guide tree is output to the input.tree file (True) or " 304 "not (False, default)"), 305 #Do not report progress. Default: off 306 _Switch(["--quiet", "quiet"], 307 "Do not report progress (True) or not (False, default)."), 308 #**** Input **** 309 #Assume the sequences are nucleotide. Deafult: auto 310 _Switch(["--nuc", "nuc"], 311 "Assume the sequences are nucleotide (True/False). " 312 "Default: auto"), 313 #Assume the sequences are amino acid. Deafult: auto 314 _Switch(["--amino", "amino"], 315 "Assume the sequences are amino acid (True/False). " 316 "Default: auto"), 317 ###################### SEEDS ##################################### 318 # MAFFT has multiple --seed commands where the unaligned input is 319 # aligned to the seed alignment. There can be multiple seeds in the 320 # form: "mafft --seed align1 --seed align2 [etc] input" 321 # Effectively for n number of seed alignments. Here we're going to 322 # assume 6 extra are enough 323 _Option(["--seed", "seed"], 324 "Seed alignments given in alignment_n (fasta format) " 325 "are aligned with sequences in input.", 326 filename=True, 327 equate=False), 328 #The old solution of also defining extra parameters with 329 #["--seed", "seed1"] etc worked, but clashes with the recent 330 #code in the base class to look for duplicate parameters and raise 331 #an error. Perhaps that check should be ignored here, or maybe 332 #we can handle this more elegantly... 333 #TODO - Create an _OptionList parameter which allows a list to be 334 #assigned to the value? 335 ####################### END SEEDS ################################ 336 #The input (must be FASTA format) 337 _Argument(["input"], 338 "Input file name", 339 filename=True, 340 is_required=True), 341 ################################################################### 342 #mafft-profile takes a second alignment input as an argument: 343 #mafft-profile align1 align2 344 _Argument(["input1"], 345 "Second input file name for the mafft-profile command", 346 filename=True), 347 ] 348 AbstractCommandline.__init__(self, cmd, **kwargs)
349 350 351 if __name__ == "__main__": 352 from Bio._utils import run_doctest 353 run_doctest() 354