Package Bio :: Package Sequencing :: Package Applications :: Module _bwa
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Applications._bwa

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Command line wrapper for bwa 
  7  """ 
  8   
  9  from __future__ import print_function 
 10  from Bio._py3k import basestring 
 11   
 12  from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline 
 13  from Bio.Application import _StaticArgument 
 14   
 15   
16 -class BwaIndexCommandline(AbstractCommandline):
17 """Command line wrapper for Burrows Wheeler Aligner (BWA) index. 18 19 Index database sequences in the FASTA format, equivalent to:: 20 21 $ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta> 22 23 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 24 25 Example 26 ------- 27 28 >>> from Bio.Sequencing.Applications import BwaIndexCommandline 29 >>> reference_genome = "/path/to/reference_genome.fasta" 30 >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") 31 >>> print(index_cmd) 32 bwa index -a bwtsw /path/to/reference_genome.fasta 33 34 You would typically run the command using index_cmd() or via the 35 Python subprocess module, as described in the Biopython tutorial. 36 37 """
38 - def __init__(self, cmd="bwa", **kwargs):
39 self.program_name = cmd 40 self.parameters = \ 41 [ 42 _StaticArgument("index"), 43 _Option(["-a", "a", "algorithm"], 44 """Algorithm for constructing BWT index. 45 46 Available options are: 47 - is: IS linear-time algorithm for constructing suffix array. 48 It requires 5.37N memory where N is the size of the database. 49 IS is moderately fast, but does not work with database larger 50 than 2GB. IS is the default algorithm due to its simplicity. 51 - bwtsw: Algorithm implemented in BWT-SW. This method works with the 52 whole human genome, but it does not work with database 53 smaller than 10MB and it is usually slower than IS.""", 54 checker_function=lambda x: x in ["is", "bwtsw"], 55 equate=False, is_required=True), 56 _Option(["-p", "p", "prefix"], 57 "Prefix of the output database [same as db filename]", 58 equate=False, is_required=False), 59 _Argument(["infile"], "Input file name", filename=True, is_required=True), 60 _Switch(["-c", "c"], 61 "Build color-space index. The input fasta should be in nucleotide space.") 62 ] 63 AbstractCommandline.__init__(self, cmd, **kwargs)
64 65
66 -class BwaAlignCommandline(AbstractCommandline):
67 """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. 68 69 Run a BWA alignment, equivalent to:: 70 71 $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai> 72 73 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 74 75 Example: 76 77 >>> from Bio.Sequencing.Applications import BwaAlignCommandline 78 >>> reference_genome = "/path/to/reference_genome.fasta" 79 >>> read_file = "/path/to/read_1.fq" 80 >>> output_sai_file = "/path/to/read_1.sai" 81 >>> read_group="@RG\tID:foo\tSM:bar" 82 >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) 83 >>> print(align_cmd) 84 bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq 85 86 You would typically run the command line using align_cmd(stdout=output_sai_file) 87 or via the Python subprocess module, as described in the Biopython tutorial. 88 """
89 - def __init__(self, cmd="bwa", **kwargs):
90 self.program_name = cmd 91 self.parameters = \ 92 [ 93 _StaticArgument("aln"), 94 _Argument(["reference"], "Reference file name", 95 filename=True, is_required=True), 96 _Argument(["read_file"], "Read file name", 97 filename=True, is_required=True), 98 _Option(["-n", "n"], 99 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 100 checker_function=lambda x: isinstance(x, (int, float)), 101 equate=False), 102 _Option(["-o", "o"], 103 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 104 checker_function=lambda x: isinstance(x, (int, float)), 105 equate=False), 106 _Option(["-e", "e"], 107 "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", 108 checker_function=lambda x: isinstance(x, int), 109 equate=False), 110 _Option(["-d", "d"], 111 "Disallow a long deletion within INT bp towards the 3-end [16]", 112 checker_function=lambda x: isinstance(x, int), 113 equate=False), 114 _Option(["-i", "i"], 115 "Disallow an indel within INT bp towards the ends [5]", 116 checker_function=lambda x: isinstance(x, int), 117 equate=False), 118 _Option(["-l", "l"], 119 """Take the first INT subsequence as seed. 120 121 If INT is larger than the query sequence, seeding will be disabled. 122 For long reads, this option is typically ranged from 25 to 35 for 123 -k 2. [inf]""", 124 checker_function=lambda x: isinstance(x, int), 125 equate=False), 126 _Option(["-k", "k"], "Maximum edit distance in the seed [2]", 127 checker_function=lambda x: isinstance(x, int), 128 equate=False), 129 _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", 130 checker_function=lambda x: isinstance(x, int), 131 equate=False), 132 _Option(["-M", "M"], 133 "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", 134 checker_function=lambda x: isinstance(x, int), 135 equate=False), 136 _Option(["-O", "O"], "Gap open penalty [11]", 137 checker_function=lambda x: isinstance(x, int), 138 equate=False), 139 _Option(["-E", "E"], "Gap extension penalty [4]", 140 checker_function=lambda x: isinstance(x, int), 141 equate=False), 142 _Option(["-R", "R"], 143 """Proceed with suboptimal alignments if there are no more than INT equally best hits. 144 145 This option only affects paired-end mapping. Increasing this threshold helps 146 to improve the pairing accuracy at the cost of speed, especially for short 147 reads (~32bp).""", 148 checker_function=lambda x: isinstance(x, int), 149 equate=False), 150 _Option(["-q", "q"], 151 """Parameter for read trimming [0]. 152 153 BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT 154 where l is the original read length.""", 155 checker_function=lambda x: isinstance(x, int), 156 equate=False), 157 _Option(["-B", "B"], 158 "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", 159 checker_function=lambda x: isinstance(x, int), 160 equate=False), 161 _Switch(["-c", "c"], 162 "Reverse query but not complement it, which is required for alignment in the color space."), 163 _Switch(["-N", "N"], 164 "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default."), 165 _Switch(["-I", "I"], 166 "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)."), 167 _Switch(["-b", "b"], 168 "Specify the input read sequence file is the BAM format"), 169 _Switch(["-b1", "b1"], 170 "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)."), 171 _Switch(["-b2", "b2"], 172 "When -b is specified, only use the second read in a read pair in mapping.") 173 ] 174 AbstractCommandline.__init__(self, cmd, **kwargs)
175 176
177 -class BwaSamseCommandline(AbstractCommandline):
178 """Command line wrapper for Burrows Wheeler Aligner (BWA) samse. 179 180 Generate alignments in the SAM format given single-end reads. 181 Equvialent to:: 182 183 $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> 184 185 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 186 187 Example: 188 189 >>> from Bio.Sequencing.Applications import BwaSamseCommandline 190 >>> reference_genome = "/path/to/reference_genome.fasta" 191 >>> read_file = "/path/to/read_1.fq" 192 >>> sai_file = "/path/to/read_1.sai" 193 >>> output_sam_file = "/path/to/read_1.sam" 194 >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, 195 ... read_file=read_file, sai_file=sai_file) 196 >>> print(samse_cmd) 197 bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq 198 199 You would typically run the command line using samse_cmd(stdout=output_sam_file) 200 or via the Python subprocess module, as described in the Biopython tutorial. 201 """
202 - def __init__(self, cmd="bwa", **kwargs):
203 self.program_name = cmd 204 self.parameters = \ 205 [ 206 _StaticArgument("samse"), 207 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 208 _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), 209 _Argument(["read_file"], "Read file name", filename=True, is_required=True), 210 _Option(["-n", "n"], 211 """Maximum number of alignments to output in the XA tag for reads paired properly. 212 213 If a read has more than INT hits, the XA tag will not be written. [3]""", 214 checker_function=lambda x: isinstance(x, int), 215 equate=False), 216 _Option(["-r", "r"], 217 "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 218 checker_function=lambda x: isinstance(x, int), 219 equate=False), 220 ] 221 AbstractCommandline.__init__(self, cmd, **kwargs)
222 223
224 -class BwaSampeCommandline(AbstractCommandline):
225 """Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. 226 227 Generate alignments in the SAM format given paired-end reads. 228 Equivalent to:: 229 230 $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> 231 232 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 233 234 Example: 235 236 >>> from Bio.Sequencing.Applications import BwaSampeCommandline 237 >>> reference_genome = "/path/to/reference_genome.fasta" 238 >>> read_file1 = "/path/to/read_1.fq" 239 >>> read_file2 = "/path/to/read_2.fq" 240 >>> sai_file1 = "/path/to/read_1.sai" 241 >>> sai_file2 = "/path/to/read_2.sai" 242 >>> output_sam_file = "/path/to/output.sam" 243 >>> read_group = "@RG\tID:foo\tSM:bar" 244 >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, 245 ... sai_file1=sai_file1, sai_file2=sai_file2, 246 ... read_file1=read_file1, read_file2=read_file2, 247 ... r=read_group) 248 >>> print(sampe_cmd) 249 bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG ID:foo SM:bar 250 251 You would typically run the command line using sampe_cmd(stdout=output_sam_file) 252 or via the Python subprocess module, as described in the Biopython tutorial. 253 """ 254 # TODO - Should the read group have a raw tab in it, or \t?
255 - def __init__(self, cmd="bwa", **kwargs):
256 self.program_name = cmd 257 self.parameters = \ 258 [ 259 _StaticArgument("sampe"), 260 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 261 _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), 262 _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), 263 _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), 264 _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), 265 _Option(["-a", "a"], 266 """Maximum insert size for a read pair to be considered being mapped properly [500]. 267 268 Since 0.4.5, this option is only used when there are not enough 269 good alignments to infer the distribution of insert sizes.""", 270 checker_function=lambda x: isinstance(x, int), 271 equate=False), 272 _Option(["-o", "o"], 273 """Maximum occurrences of a read for pairing [100000]. 274 275 A read with more occurrences will be treated as a single-end read. 276 Reducing this parameter helps faster pairing.""", 277 checker_function=lambda x: isinstance(x, int), 278 equate=False), 279 _Option(["-n", "n"], 280 """Maximum number of alignments to output in the XA tag for reads paired properly [3]. 281 282 If a read has more than INT hits, the XA tag will not be written.""", 283 checker_function=lambda x: isinstance(x, int), 284 equate=False), 285 _Option(["-N", "N"], 286 """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. 287 288 . If a read has more than INT hits, the XA tag will not be written.""", 289 checker_function=lambda x: isinstance(x, int), 290 equate=False), 291 _Option(["-r", "r"], "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 292 checker_function=lambda x: isinstance(x, basestring), 293 equate=False), 294 ] 295 AbstractCommandline.__init__(self, cmd, **kwargs)
296 297
298 -class BwaBwaswCommandline(AbstractCommandline):
299 """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. 300 301 Align query sequences from FASTQ files. Equivalent to:: 302 303 $ bwa bwasw [...] <in.db.fasta> <in.fq> 304 305 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 306 307 Example: 308 309 >>> from Bio.Sequencing.Applications import BwaBwaswCommandline 310 >>> reference_genome = "/path/to/reference_genome.fasta" 311 >>> read_file = "/path/to/read_1.fq" 312 >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) 313 >>> print(bwasw_cmd) 314 bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq 315 316 You would typically run the command line using bwasw_cmd() or via the 317 Python subprocess module, as described in the Biopython tutorial. 318 """
319 - def __init__(self, cmd="bwa", **kwargs):
320 self.program_name = cmd 321 self.parameters = \ 322 [ 323 _StaticArgument("bwasw"), 324 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 325 _Argument(["read_file"], "Read file", filename=True, is_required=True), 326 _Argument(["mate_file"], "Mate file", filename=True, is_required=False), 327 _Option(["-a", "a"], 328 "Score of a match [1]", 329 checker_function=lambda x: isinstance(x, int), 330 equate=False), 331 _Option(["-b", "b"], 332 "Mismatch penalty [3]", 333 checker_function=lambda x: isinstance(x, int), 334 equate=False), 335 _Option(["-q", "q"], 336 "Gap open penalty [5]", 337 checker_function=lambda x: isinstance(x, int), 338 equate=False), 339 _Option(["-r", "r"], 340 "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", 341 checker_function=lambda x: isinstance(x, int), 342 equate=False), 343 _Option(["-t", "t"], 344 "Number of threads in the multi-threading mode [1]", 345 checker_function=lambda x: isinstance(x, int), 346 equate=False), 347 _Option(["-w", "w"], 348 "Band width in the banded alignment [33]", 349 checker_function=lambda x: isinstance(x, int), 350 equate=False), 351 _Option(["-T", "T"], 352 "Minimum score threshold divided by a [37]", 353 checker_function=lambda x: isinstance(x, int), 354 equate=False), 355 _Option(["-c", "c"], 356 """Coefficient for threshold adjustment according to query length [5.5]. 357 358 Given an l-long query, the threshold for a hit to be retained is 359 a*max{T,c*log(l)}.""", 360 checker_function=lambda x: isinstance(x, float), 361 equate=False), 362 _Option(["-z", "z"], 363 "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", 364 checker_function=lambda x: isinstance(x, int), 365 equate=False), 366 _Option(["-s", "s"], 367 """Maximum SA interval size for initiating a seed [3]. 368 369 Higher -s increases accuracy at the cost of speed.""", 370 checker_function=lambda x: isinstance(x, int), 371 equate=False), 372 _Option(["-N", "N"], 373 "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", 374 checker_function=lambda x: isinstance(x, int), 375 equate=False), 376 ] 377 AbstractCommandline.__init__(self, cmd, **kwargs)
378 379 380 if __name__ == "__main__": 381 from Bio._utils import run_doctest 382 run_doctest() 383