Package Bio :: Package Sequencing :: Package Applications :: Module _bwa
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Applications._bwa

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Command line wrapper for bwa 
  7  """ 
  8   
  9  from __future__ import print_function 
 10  from Bio._py3k import basestring 
 11   
 12  from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline 
 13  from Bio.Application import _StaticArgument 
 14   
 15   
16 -class BwaIndexCommandline(AbstractCommandline):
17 """Command line wrapper for Burrows Wheeler Aligner (BWA) index. 18 19 Index database sequences in the FASTA format, equivalent to:: 20 21 $ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta> 22 23 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 24 25 Example 26 ------- 27 28 >>> from Bio.Sequencing.Applications import BwaIndexCommandline 29 >>> reference_genome = "/path/to/reference_genome.fasta" 30 >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") 31 >>> print(index_cmd) 32 bwa index -a bwtsw /path/to/reference_genome.fasta 33 34 You would typically run the command using index_cmd() or via the 35 Python subprocess module, as described in the Biopython tutorial. 36 37 """
38 - def __init__(self, cmd="bwa", **kwargs):
39 self.program_name = cmd 40 self.parameters = [ 41 _StaticArgument("index"), 42 _Option(["-a", "a", "algorithm"], 43 """Algorithm for constructing BWT index. 44 45 Available options are: 46 - is: IS linear-time algorithm for constructing suffix array. 47 It requires 5.37N memory where N is the size of the database. 48 IS is moderately fast, but does not work with database larger 49 than 2GB. IS is the default algorithm due to its simplicity. 50 - bwtsw: Algorithm implemented in BWT-SW. This method works with the 51 whole human genome, but it does not work with database 52 smaller than 10MB and it is usually slower than IS.""", 53 checker_function=lambda x: x in ["is", "bwtsw"], 54 equate=False, is_required=True), 55 _Option(["-p", "p", "prefix"], 56 "Prefix of the output database [same as db filename]", 57 equate=False, is_required=False), 58 _Argument(["infile"], "Input file name", filename=True, is_required=True), 59 _Switch(["-c", "c"], 60 "Build color-space index. The input fasta should be in nucleotide space.") 61 ] 62 AbstractCommandline.__init__(self, cmd, **kwargs)
63 64
65 -class BwaAlignCommandline(AbstractCommandline):
66 """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. 67 68 Run a BWA alignment, equivalent to:: 69 70 $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai> 71 72 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 73 74 Example: 75 76 >>> from Bio.Sequencing.Applications import BwaAlignCommandline 77 >>> reference_genome = "/path/to/reference_genome.fasta" 78 >>> read_file = "/path/to/read_1.fq" 79 >>> output_sai_file = "/path/to/read_1.sai" 80 >>> read_group="@RG\tID:foo\tSM:bar" 81 >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) 82 >>> print(align_cmd) 83 bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq 84 85 You would typically run the command line using align_cmd(stdout=output_sai_file) 86 or via the Python subprocess module, as described in the Biopython tutorial. 87 """
88 - def __init__(self, cmd="bwa", **kwargs):
89 self.program_name = cmd 90 self.parameters = [ 91 _StaticArgument("aln"), 92 _Argument(["reference"], "Reference file name", 93 filename=True, is_required=True), 94 _Argument(["read_file"], "Read file name", 95 filename=True, is_required=True), 96 _Option(["-n", "n"], 97 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 98 checker_function=lambda x: isinstance(x, (int, float)), 99 equate=False), 100 _Option(["-o", "o"], 101 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 102 checker_function=lambda x: isinstance(x, (int, float)), 103 equate=False), 104 _Option(["-e", "e"], 105 "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", 106 checker_function=lambda x: isinstance(x, int), 107 equate=False), 108 _Option(["-d", "d"], 109 "Disallow a long deletion within INT bp towards the 3-end [16]", 110 checker_function=lambda x: isinstance(x, int), 111 equate=False), 112 _Option(["-i", "i"], 113 "Disallow an indel within INT bp towards the ends [5]", 114 checker_function=lambda x: isinstance(x, int), 115 equate=False), 116 _Option(["-l", "l"], 117 """Take the first INT subsequence as seed. 118 119 If INT is larger than the query sequence, seeding will be disabled. 120 For long reads, this option is typically ranged from 25 to 35 for 121 -k 2. [inf]""", 122 checker_function=lambda x: isinstance(x, int), 123 equate=False), 124 _Option(["-k", "k"], "Maximum edit distance in the seed [2]", 125 checker_function=lambda x: isinstance(x, int), 126 equate=False), 127 _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", 128 checker_function=lambda x: isinstance(x, int), 129 equate=False), 130 _Option(["-M", "M"], 131 "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", 132 checker_function=lambda x: isinstance(x, int), 133 equate=False), 134 _Option(["-O", "O"], "Gap open penalty [11]", 135 checker_function=lambda x: isinstance(x, int), 136 equate=False), 137 _Option(["-E", "E"], "Gap extension penalty [4]", 138 checker_function=lambda x: isinstance(x, int), 139 equate=False), 140 _Option(["-R", "R"], 141 """Proceed with suboptimal alignments if there are no more than INT equally best hits. 142 143 This option only affects paired-end mapping. Increasing this threshold helps 144 to improve the pairing accuracy at the cost of speed, especially for short 145 reads (~32bp).""", 146 checker_function=lambda x: isinstance(x, int), 147 equate=False), 148 _Option(["-q", "q"], 149 """Parameter for read trimming [0]. 150 151 BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT 152 where l is the original read length.""", 153 checker_function=lambda x: isinstance(x, int), 154 equate=False), 155 _Option(["-B", "B"], 156 "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", 157 checker_function=lambda x: isinstance(x, int), 158 equate=False), 159 _Switch(["-c", "c"], 160 "Reverse query but not complement it, which is required for alignment in the color space."), 161 _Switch(["-N", "N"], 162 "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default."), 163 _Switch(["-I", "I"], 164 "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)."), 165 _Switch(["-b", "b"], 166 "Specify the input read sequence file is the BAM format"), 167 _Switch(["-b1", "b1"], 168 "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)."), 169 _Switch(["-b2", "b2"], 170 "When -b is specified, only use the second read in a read pair in mapping.") 171 ] 172 AbstractCommandline.__init__(self, cmd, **kwargs)
173 174
175 -class BwaSamseCommandline(AbstractCommandline):
176 """Command line wrapper for Burrows Wheeler Aligner (BWA) samse. 177 178 Generate alignments in the SAM format given single-end reads. 179 Equvialent to:: 180 181 $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> 182 183 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 184 185 Example: 186 187 >>> from Bio.Sequencing.Applications import BwaSamseCommandline 188 >>> reference_genome = "/path/to/reference_genome.fasta" 189 >>> read_file = "/path/to/read_1.fq" 190 >>> sai_file = "/path/to/read_1.sai" 191 >>> output_sam_file = "/path/to/read_1.sam" 192 >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, 193 ... read_file=read_file, sai_file=sai_file) 194 >>> print(samse_cmd) 195 bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq 196 197 You would typically run the command line using samse_cmd(stdout=output_sam_file) 198 or via the Python subprocess module, as described in the Biopython tutorial. 199 """
200 - def __init__(self, cmd="bwa", **kwargs):
201 self.program_name = cmd 202 self.parameters = [ 203 _StaticArgument("samse"), 204 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 205 _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), 206 _Argument(["read_file"], "Read file name", filename=True, is_required=True), 207 _Option(["-n", "n"], 208 """Maximum number of alignments to output in the XA tag for reads paired properly. 209 210 If a read has more than INT hits, the XA tag will not be written. [3]""", 211 checker_function=lambda x: isinstance(x, int), 212 equate=False), 213 _Option(["-r", "r"], 214 "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 215 checker_function=lambda x: isinstance(x, int), 216 equate=False), 217 ] 218 AbstractCommandline.__init__(self, cmd, **kwargs)
219 220
221 -class BwaSampeCommandline(AbstractCommandline):
222 """Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. 223 224 Generate alignments in the SAM format given paired-end reads. 225 Equivalent to:: 226 227 $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> 228 229 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 230 231 Example: 232 233 >>> from Bio.Sequencing.Applications import BwaSampeCommandline 234 >>> reference_genome = "/path/to/reference_genome.fasta" 235 >>> read_file1 = "/path/to/read_1.fq" 236 >>> read_file2 = "/path/to/read_2.fq" 237 >>> sai_file1 = "/path/to/read_1.sai" 238 >>> sai_file2 = "/path/to/read_2.sai" 239 >>> output_sam_file = "/path/to/output.sam" 240 >>> read_group = "@RG\tID:foo\tSM:bar" 241 >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, 242 ... sai_file1=sai_file1, sai_file2=sai_file2, 243 ... read_file1=read_file1, read_file2=read_file2, 244 ... r=read_group) 245 >>> print(sampe_cmd) 246 bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG ID:foo SM:bar 247 248 You would typically run the command line using sampe_cmd(stdout=output_sam_file) 249 or via the Python subprocess module, as described in the Biopython tutorial. 250 """ 251 # TODO - Should the read group have a raw tab in it, or \t?
252 - def __init__(self, cmd="bwa", **kwargs):
253 self.program_name = cmd 254 self.parameters = [ 255 _StaticArgument("sampe"), 256 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 257 _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), 258 _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), 259 _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), 260 _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), 261 _Option(["-a", "a"], 262 """Maximum insert size for a read pair to be considered being mapped properly [500]. 263 264 Since 0.4.5, this option is only used when there are not enough 265 good alignments to infer the distribution of insert sizes.""", 266 checker_function=lambda x: isinstance(x, int), 267 equate=False), 268 _Option(["-o", "o"], 269 """Maximum occurrences of a read for pairing [100000]. 270 271 A read with more occurrences will be treated as a single-end read. 272 Reducing this parameter helps faster pairing.""", 273 checker_function=lambda x: isinstance(x, int), 274 equate=False), 275 _Option(["-n", "n"], 276 """Maximum number of alignments to output in the XA tag for reads paired properly [3]. 277 278 If a read has more than INT hits, the XA tag will not be written.""", 279 checker_function=lambda x: isinstance(x, int), 280 equate=False), 281 _Option(["-N", "N"], 282 """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. 283 284 If a read has more than INT hits, the XA tag will not be written.""", 285 checker_function=lambda x: isinstance(x, int), 286 equate=False), 287 _Option(["-r", "r"], "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 288 checker_function=lambda x: isinstance(x, basestring), 289 equate=False), 290 ] 291 AbstractCommandline.__init__(self, cmd, **kwargs)
292 293
294 -class BwaBwaswCommandline(AbstractCommandline):
295 """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. 296 297 Align query sequences from FASTQ files. Equivalent to:: 298 299 $ bwa bwasw [...] <in.db.fasta> <in.fq> 300 301 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 302 303 Example: 304 305 >>> from Bio.Sequencing.Applications import BwaBwaswCommandline 306 >>> reference_genome = "/path/to/reference_genome.fasta" 307 >>> read_file = "/path/to/read_1.fq" 308 >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) 309 >>> print(bwasw_cmd) 310 bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq 311 312 You would typically run the command line using bwasw_cmd() or via the 313 Python subprocess module, as described in the Biopython tutorial. 314 """
315 - def __init__(self, cmd="bwa", **kwargs):
316 self.program_name = cmd 317 self.parameters = [ 318 _StaticArgument("bwasw"), 319 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 320 _Argument(["read_file"], "Read file", filename=True, is_required=True), 321 _Argument(["mate_file"], "Mate file", filename=True, is_required=False), 322 _Option(["-a", "a"], 323 "Score of a match [1]", 324 checker_function=lambda x: isinstance(x, int), 325 equate=False), 326 _Option(["-b", "b"], 327 "Mismatch penalty [3]", 328 checker_function=lambda x: isinstance(x, int), 329 equate=False), 330 _Option(["-q", "q"], 331 "Gap open penalty [5]", 332 checker_function=lambda x: isinstance(x, int), 333 equate=False), 334 _Option(["-r", "r"], 335 "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", 336 checker_function=lambda x: isinstance(x, int), 337 equate=False), 338 _Option(["-t", "t"], 339 "Number of threads in the multi-threading mode [1]", 340 checker_function=lambda x: isinstance(x, int), 341 equate=False), 342 _Option(["-w", "w"], 343 "Band width in the banded alignment [33]", 344 checker_function=lambda x: isinstance(x, int), 345 equate=False), 346 _Option(["-T", "T"], 347 "Minimum score threshold divided by a [37]", 348 checker_function=lambda x: isinstance(x, int), 349 equate=False), 350 _Option(["-c", "c"], 351 """Coefficient for threshold adjustment according to query length [5.5]. 352 353 Given an l-long query, the threshold for a hit to be retained is 354 a*max{T,c*log(l)}.""", 355 checker_function=lambda x: isinstance(x, float), 356 equate=False), 357 _Option(["-z", "z"], 358 "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", 359 checker_function=lambda x: isinstance(x, int), 360 equate=False), 361 _Option(["-s", "s"], 362 """Maximum SA interval size for initiating a seed [3]. 363 364 Higher -s increases accuracy at the cost of speed.""", 365 checker_function=lambda x: isinstance(x, int), 366 equate=False), 367 _Option(["-N", "N"], 368 "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", 369 checker_function=lambda x: isinstance(x, int), 370 equate=False), 371 ] 372 AbstractCommandline.__init__(self, cmd, **kwargs)
373 374 375 if __name__ == "__main__": 376 from Bio._utils import run_doctest 377 run_doctest() 378