Package Bio :: Package Sequencing :: Package Applications :: Module _bwa
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Applications._bwa

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Command line wrapper for bwa.""" 
  7   
  8  from __future__ import print_function 
  9  from Bio._py3k import basestring 
 10   
 11  from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline 
 12  from Bio.Application import _StaticArgument 
 13   
 14   
15 -class BwaIndexCommandline(AbstractCommandline):
16 """Command line wrapper for Burrows Wheeler Aligner (BWA) index. 17 18 Index database sequences in the FASTA format, equivalent to:: 19 20 $ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta> 21 22 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 23 24 Example 25 ------- 26 27 >>> from Bio.Sequencing.Applications import BwaIndexCommandline 28 >>> reference_genome = "/path/to/reference_genome.fasta" 29 >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") 30 >>> print(index_cmd) 31 bwa index -a bwtsw /path/to/reference_genome.fasta 32 33 You would typically run the command using index_cmd() or via the 34 Python subprocess module, as described in the Biopython tutorial. 35 36 """ 37
38 - def __init__(self, cmd="bwa", **kwargs):
39 self.program_name = cmd 40 self.parameters = [ 41 _StaticArgument("index"), 42 _Option(["-a", "a", "algorithm"], 43 """Algorithm for constructing BWT index. 44 45 Available options are: 46 - is: IS linear-time algorithm for constructing suffix array. 47 It requires 5.37N memory where N is the size of the database. 48 IS is moderately fast, but does not work with database larger 49 than 2GB. IS is the default algorithm due to its simplicity. 50 - bwtsw: Algorithm implemented in BWT-SW. This method works with the 51 whole human genome, but it does not work with database 52 smaller than 10MB and it is usually slower than IS.""", 53 checker_function=lambda x: x in ["is", "bwtsw"], 54 equate=False, is_required=True), 55 _Option(["-p", "p", "prefix"], 56 "Prefix of the output database [same as db filename]", 57 equate=False, is_required=False), 58 _Argument(["infile"], "Input file name", filename=True, is_required=True), 59 _Switch(["-c", "c"], 60 "Build color-space index. The input fasta should be in nucleotide space.") 61 ] 62 AbstractCommandline.__init__(self, cmd, **kwargs)
63 64
65 -class BwaAlignCommandline(AbstractCommandline):
66 """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. 67 68 Run a BWA alignment, equivalent to:: 69 70 $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai> 71 72 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 73 74 Example: 75 76 >>> from Bio.Sequencing.Applications import BwaAlignCommandline 77 >>> reference_genome = "/path/to/reference_genome.fasta" 78 >>> read_file = "/path/to/read_1.fq" 79 >>> output_sai_file = "/path/to/read_1.sai" 80 >>> read_group="@RG\tID:foo\tSM:bar" 81 >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) 82 >>> print(align_cmd) 83 bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq 84 85 You would typically run the command line using align_cmd(stdout=output_sai_file) 86 or via the Python subprocess module, as described in the Biopython tutorial. 87 """ 88
89 - def __init__(self, cmd="bwa", **kwargs):
90 self.program_name = cmd 91 self.parameters = [ 92 _StaticArgument("aln"), 93 _Argument(["reference"], "Reference file name", 94 filename=True, is_required=True), 95 _Argument(["read_file"], "Read file name", 96 filename=True, is_required=True), 97 _Option(["-n", "n"], 98 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 99 checker_function=lambda x: isinstance(x, (int, float)), 100 equate=False), 101 _Option(["-o", "o"], 102 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 103 checker_function=lambda x: isinstance(x, (int, float)), 104 equate=False), 105 _Option(["-e", "e"], 106 "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", 107 checker_function=lambda x: isinstance(x, int), 108 equate=False), 109 _Option(["-d", "d"], 110 "Disallow a long deletion within INT bp towards the 3-end [16]", 111 checker_function=lambda x: isinstance(x, int), 112 equate=False), 113 _Option(["-i", "i"], 114 "Disallow an indel within INT bp towards the ends [5]", 115 checker_function=lambda x: isinstance(x, int), 116 equate=False), 117 _Option(["-l", "l"], 118 """Take the first INT subsequence as seed. 119 120 If INT is larger than the query sequence, seeding will be disabled. 121 For long reads, this option is typically ranged from 25 to 35 for 122 -k 2. [inf]""", 123 checker_function=lambda x: isinstance(x, int), 124 equate=False), 125 _Option(["-k", "k"], "Maximum edit distance in the seed [2]", 126 checker_function=lambda x: isinstance(x, int), 127 equate=False), 128 _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", 129 checker_function=lambda x: isinstance(x, int), 130 equate=False), 131 _Option(["-M", "M"], 132 "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", 133 checker_function=lambda x: isinstance(x, int), 134 equate=False), 135 _Option(["-O", "O"], "Gap open penalty [11]", 136 checker_function=lambda x: isinstance(x, int), 137 equate=False), 138 _Option(["-E", "E"], "Gap extension penalty [4]", 139 checker_function=lambda x: isinstance(x, int), 140 equate=False), 141 _Option(["-R", "R"], 142 """Proceed with suboptimal alignments if there are no more than INT equally best hits. 143 144 This option only affects paired-end mapping. Increasing this threshold helps 145 to improve the pairing accuracy at the cost of speed, especially for short 146 reads (~32bp).""", 147 checker_function=lambda x: isinstance(x, int), 148 equate=False), 149 _Option(["-q", "q"], 150 """Parameter for read trimming [0]. 151 152 BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT 153 where l is the original read length.""", 154 checker_function=lambda x: isinstance(x, int), 155 equate=False), 156 _Option(["-B", "B"], 157 "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", 158 checker_function=lambda x: isinstance(x, int), 159 equate=False), 160 _Switch(["-c", "c"], 161 "Reverse query but not complement it, which is required for alignment in the color space."), 162 _Switch(["-N", "N"], 163 "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default."), 164 _Switch(["-I", "I"], 165 "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)."), 166 _Switch(["-b", "b"], 167 "Specify the input read sequence file is the BAM format"), 168 _Switch(["-b1", "b1"], 169 "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)."), 170 _Switch(["-b2", "b2"], 171 "When -b is specified, only use the second read in a read pair in mapping.") 172 ] 173 AbstractCommandline.__init__(self, cmd, **kwargs)
174 175
176 -class BwaSamseCommandline(AbstractCommandline):
177 """Command line wrapper for Burrows Wheeler Aligner (BWA) samse. 178 179 Generate alignments in the SAM format given single-end reads. 180 Equvialent to:: 181 182 $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> 183 184 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 185 186 Example: 187 188 >>> from Bio.Sequencing.Applications import BwaSamseCommandline 189 >>> reference_genome = "/path/to/reference_genome.fasta" 190 >>> read_file = "/path/to/read_1.fq" 191 >>> sai_file = "/path/to/read_1.sai" 192 >>> output_sam_file = "/path/to/read_1.sam" 193 >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, 194 ... read_file=read_file, sai_file=sai_file) 195 >>> print(samse_cmd) 196 bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq 197 198 You would typically run the command line using samse_cmd(stdout=output_sam_file) 199 or via the Python subprocess module, as described in the Biopython tutorial. 200 """ 201
202 - def __init__(self, cmd="bwa", **kwargs):
203 self.program_name = cmd 204 self.parameters = [ 205 _StaticArgument("samse"), 206 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 207 _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), 208 _Argument(["read_file"], "Read file name", filename=True, is_required=True), 209 _Option(["-n", "n"], 210 """Maximum number of alignments to output in the XA tag for reads paired properly. 211 212 If a read has more than INT hits, the XA tag will not be written. [3]""", 213 checker_function=lambda x: isinstance(x, int), 214 equate=False), 215 _Option(["-r", "r"], 216 "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 217 checker_function=lambda x: isinstance(x, int), 218 equate=False), 219 ] 220 AbstractCommandline.__init__(self, cmd, **kwargs)
221 222
223 -class BwaSampeCommandline(AbstractCommandline):
224 """Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. 225 226 Generate alignments in the SAM format given paired-end reads. 227 Equivalent to:: 228 229 $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> 230 231 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 232 233 Example: 234 235 >>> from Bio.Sequencing.Applications import BwaSampeCommandline 236 >>> reference_genome = "/path/to/reference_genome.fasta" 237 >>> read_file1 = "/path/to/read_1.fq" 238 >>> read_file2 = "/path/to/read_2.fq" 239 >>> sai_file1 = "/path/to/read_1.sai" 240 >>> sai_file2 = "/path/to/read_2.sai" 241 >>> output_sam_file = "/path/to/output.sam" 242 >>> read_group = "@RG\tID:foo\tSM:bar" 243 >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, 244 ... sai_file1=sai_file1, sai_file2=sai_file2, 245 ... read_file1=read_file1, read_file2=read_file2, 246 ... r=read_group) 247 >>> print(sampe_cmd) 248 bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG ID:foo SM:bar 249 250 You would typically run the command line using sampe_cmd(stdout=output_sam_file) 251 or via the Python subprocess module, as described in the Biopython tutorial. 252 """ 253 254 # TODO - Should the read group have a raw tab in it, or \t? 255
256 - def __init__(self, cmd="bwa", **kwargs):
257 self.program_name = cmd 258 self.parameters = [ 259 _StaticArgument("sampe"), 260 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 261 _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), 262 _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), 263 _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), 264 _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), 265 _Option(["-a", "a"], 266 """Maximum insert size for a read pair to be considered being mapped properly [500]. 267 268 Since 0.4.5, this option is only used when there are not enough 269 good alignments to infer the distribution of insert sizes.""", 270 checker_function=lambda x: isinstance(x, int), 271 equate=False), 272 _Option(["-o", "o"], 273 """Maximum occurrences of a read for pairing [100000]. 274 275 A read with more occurrences will be treated as a single-end read. 276 Reducing this parameter helps faster pairing.""", 277 checker_function=lambda x: isinstance(x, int), 278 equate=False), 279 _Option(["-n", "n"], 280 """Maximum number of alignments to output in the XA tag for reads paired properly [3]. 281 282 If a read has more than INT hits, the XA tag will not be written.""", 283 checker_function=lambda x: isinstance(x, int), 284 equate=False), 285 _Option(["-N", "N"], 286 """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. 287 288 If a read has more than INT hits, the XA tag will not be written.""", 289 checker_function=lambda x: isinstance(x, int), 290 equate=False), 291 _Option(["-r", "r"], "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 292 checker_function=lambda x: isinstance(x, basestring), 293 equate=False), 294 ] 295 AbstractCommandline.__init__(self, cmd, **kwargs)
296 297
298 -class BwaBwaswCommandline(AbstractCommandline):
299 """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. 300 301 Align query sequences from FASTQ files. Equivalent to:: 302 303 $ bwa bwasw [...] <in.db.fasta> <in.fq> 304 305 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 306 307 Example: 308 309 >>> from Bio.Sequencing.Applications import BwaBwaswCommandline 310 >>> reference_genome = "/path/to/reference_genome.fasta" 311 >>> read_file = "/path/to/read_1.fq" 312 >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) 313 >>> print(bwasw_cmd) 314 bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq 315 316 You would typically run the command line using bwasw_cmd() or via the 317 Python subprocess module, as described in the Biopython tutorial. 318 """ 319
320 - def __init__(self, cmd="bwa", **kwargs):
321 self.program_name = cmd 322 self.parameters = [ 323 _StaticArgument("bwasw"), 324 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 325 _Argument(["read_file"], "Read file", filename=True, is_required=True), 326 _Argument(["mate_file"], "Mate file", filename=True, is_required=False), 327 _Option(["-a", "a"], 328 "Score of a match [1]", 329 checker_function=lambda x: isinstance(x, int), 330 equate=False), 331 _Option(["-b", "b"], 332 "Mismatch penalty [3]", 333 checker_function=lambda x: isinstance(x, int), 334 equate=False), 335 _Option(["-q", "q"], 336 "Gap open penalty [5]", 337 checker_function=lambda x: isinstance(x, int), 338 equate=False), 339 _Option(["-r", "r"], 340 "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", 341 checker_function=lambda x: isinstance(x, int), 342 equate=False), 343 _Option(["-t", "t"], 344 "Number of threads in the multi-threading mode [1]", 345 checker_function=lambda x: isinstance(x, int), 346 equate=False), 347 _Option(["-w", "w"], 348 "Band width in the banded alignment [33]", 349 checker_function=lambda x: isinstance(x, int), 350 equate=False), 351 _Option(["-T", "T"], 352 "Minimum score threshold divided by a [37]", 353 checker_function=lambda x: isinstance(x, int), 354 equate=False), 355 _Option(["-c", "c"], 356 """Coefficient for threshold adjustment according to query length [5.5]. 357 358 Given an l-long query, the threshold for a hit to be retained is 359 a*max{T,c*log(l)}.""", 360 checker_function=lambda x: isinstance(x, float), 361 equate=False), 362 _Option(["-z", "z"], 363 "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", 364 checker_function=lambda x: isinstance(x, int), 365 equate=False), 366 _Option(["-s", "s"], 367 """Maximum SA interval size for initiating a seed [3]. 368 369 Higher -s increases accuracy at the cost of speed.""", 370 checker_function=lambda x: isinstance(x, int), 371 equate=False), 372 _Option(["-N", "N"], 373 "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", 374 checker_function=lambda x: isinstance(x, int), 375 equate=False), 376 ] 377 AbstractCommandline.__init__(self, cmd, **kwargs)
378 379 380 if __name__ == "__main__": 381 from Bio._utils import run_doctest 382 run_doctest() 383