Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009-2016 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Optimised sequence conversion code (PRIVATE). 
  8   
  9  You are not expected to access this module, or any of its code, directly. This 
 10  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
 11  public interface for this. 
 12   
 13  The idea here is that while doing this will work:: 
 14   
 15      from Bio import SeqIO 
 16      records = SeqIO.parse(in_handle, in_format) 
 17      count = SeqIO.write(records, out_handle, out_format) 
 18   
 19  it is shorter to write:: 
 20   
 21      from Bio import SeqIO 
 22      count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 23   
 24  Also, the convert function can take a number of special case optimisations. This 
 25  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 26  All these file format specific optimisations are handled by this (private) module. 
 27  """ 
 28   
 29  from Bio import BiopythonWarning 
 30  from Bio import SeqIO 
 31  # NOTE - Lots of lazy imports further on... 
 32   
 33   
34 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
35 """Fast GenBank to FASTA (PRIVATE).""" 36 # We don't need to parse the features... 37 from Bio.GenBank.Scanner import GenBankScanner 38 records = GenBankScanner().parse_records(in_handle, do_features=False) 39 # For FASTA output we can ignore the alphabet too 40 return SeqIO.write(records, out_handle, "fasta")
41 42
43 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
44 """Fast EMBL to FASTA (PRIVATE).""" 45 # We don't need to parse the features... 46 from Bio.GenBank.Scanner import EmblScanner 47 records = EmblScanner().parse_records(in_handle, do_features=False) 48 # For FASTA output we can ignore the alphabet too 49 return SeqIO.write(records, out_handle, "fasta")
50 51
52 -def _fastq_generic(in_handle, out_handle, mapping):
53 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 54 from Bio.SeqIO.QualityIO import FastqGeneralIterator 55 # For real speed, don't even make SeqRecord and Seq objects! 56 count = 0 57 null = chr(0) 58 for title, seq, old_qual in FastqGeneralIterator(in_handle): 59 count += 1 60 # map the qual... 61 qual = old_qual.translate(mapping) 62 if null in qual: 63 raise ValueError("Invalid character in quality string") 64 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 65 return count
66 67
68 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
69 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 70 from Bio.SeqIO.QualityIO import FastqGeneralIterator 71 # For real speed, don't even make SeqRecord and Seq objects! 72 count = 0 73 null = chr(0) 74 for title, seq, old_qual in FastqGeneralIterator(in_handle): 75 count += 1 76 # map the qual... 77 qual = old_qual.translate(mapping) 78 if null in qual: 79 raise ValueError("Invalid character in quality string") 80 if truncate_char in qual: 81 qual = qual.replace(truncate_char, chr(126)) 82 import warnings 83 warnings.warn(truncate_msg, BiopythonWarning) 84 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 85 return count
86 87
88 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
89 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 90 91 Useful for removing line wrapping and the redundant second identifier 92 on the plus lines. Will check also check the quality string is valid. 93 94 Avoids creating SeqRecord and Seq objects in order to speed up this 95 conversion. 96 """ 97 # Map unexpected chars to null 98 mapping = "".join([chr(0) for ascii in range(0, 33)] + 99 [chr(ascii) for ascii in range(33, 127)] + 100 [chr(0) for ascii in range(127, 256)]) 101 assert len(mapping) == 256 102 return _fastq_generic(in_handle, out_handle, mapping)
103 104
105 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
106 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 107 108 Useful for removing line wrapping and the redundant second identifier 109 on the plus lines. Will check also check the quality string is valid. 110 Avoids creating SeqRecord and Seq objects in order to speed up this 111 conversion. 112 """ 113 # Map unexpected chars to null 114 mapping = "".join([chr(0) for ascii in range(0, 59)] + 115 [chr(ascii) for ascii in range(59, 127)] + 116 [chr(0) for ascii in range(127, 256)]) 117 assert len(mapping) == 256 118 return _fastq_generic(in_handle, out_handle, mapping)
119 120
121 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
122 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 123 124 Useful for removing line wrapping and the redundant second identifier 125 on the plus lines. Will check also check the quality string is valid. 126 Avoids creating SeqRecord and Seq objects in order to speed up this 127 conversion. 128 """ 129 # Map unexpected chars to null 130 mapping = "".join([chr(0) for ascii in range(0, 64)] + 131 [chr(ascii) for ascii in range(64, 127)] + 132 [chr(0) for ascii in range(127, 256)]) 133 assert len(mapping) == 256 134 return _fastq_generic(in_handle, out_handle, mapping)
135 136
137 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
138 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 139 140 Avoids creating SeqRecord and Seq objects in order to speed up this 141 conversion. 142 """ 143 # Map unexpected chars to null 144 mapping = "".join([chr(0) for ascii in range(0, 64)] + 145 [chr(33 + q) for q in range(0, 62 + 1)] + 146 [chr(0) for ascii in range(127, 256)]) 147 assert len(mapping) == 256 148 return _fastq_generic(in_handle, out_handle, mapping)
149 150
151 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
152 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 153 154 Avoids creating SeqRecord and Seq objects in order to speed up this 155 conversion. Will issue a warning if the scores had to be truncated at 62 156 (maximum possible in the Illumina 1.3+ FASTQ format) 157 """ 158 # Map unexpected chars to null 159 trunc_char = chr(1) 160 mapping = "".join([chr(0) for ascii in range(0, 33)] + 161 [chr(64 + q) for q in range(0, 62 + 1)] + 162 [trunc_char for ascii in range(96, 127)] + 163 [chr(0) for ascii in range(127, 256)]) 164 assert len(mapping) == 256 165 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 166 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
167 168
169 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
170 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 171 172 Avoids creating SeqRecord and Seq objects in order to speed up this 173 conversion. 174 """ 175 # Map unexpected chars to null 176 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 177 mapping = "".join([chr(0) for ascii in range(0, 59)] + 178 [chr(33 + int(round(phred_quality_from_solexa(q)))) 179 for q in range(-5, 62 + 1)] + 180 [chr(0) for ascii in range(127, 256)]) 181 assert len(mapping) == 256 182 return _fastq_generic(in_handle, out_handle, mapping)
183 184
185 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
186 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 187 188 Avoids creating SeqRecord and Seq objects in order to speed up this 189 conversion. Will issue a warning if the scores had to be truncated at 62 190 (maximum possible in the Solexa FASTQ format) 191 """ 192 # Map unexpected chars to null 193 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 194 trunc_char = chr(1) 195 mapping = "".join([chr(0) for ascii in range(0, 33)] + 196 [chr(64 + int(round(solexa_quality_from_phred(q)))) 197 for q in range(0, 62 + 1)] + 198 [trunc_char for ascii in range(96, 127)] + 199 [chr(0) for ascii in range(127, 256)]) 200 assert len(mapping) == 256 201 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 202 "Data loss - max Solexa quality 62 in Solexa FASTQ")
203 204
205 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
206 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 207 208 Avoids creating SeqRecord and Seq objects in order to speed up this 209 conversion. 210 """ 211 # Map unexpected chars to null 212 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 213 mapping = "".join([chr(0) for ascii in range(0, 59)] + 214 [chr(64 + int(round(phred_quality_from_solexa(q)))) 215 for q in range(-5, 62 + 1)] + 216 [chr(0) for ascii in range(127, 256)]) 217 assert len(mapping) == 256 218 return _fastq_generic(in_handle, out_handle, mapping)
219 220
221 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
222 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 223 224 Avoids creating SeqRecord and Seq objects in order to speed up this 225 conversion. 226 """ 227 # Map unexpected chars to null 228 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 229 mapping = "".join([chr(0) for ascii in range(0, 64)] + 230 [chr(64 + int(round(solexa_quality_from_phred(q)))) 231 for q in range(0, 62 + 1)] + 232 [chr(0) for ascii in range(127, 256)]) 233 assert len(mapping) == 256 234 return _fastq_generic(in_handle, out_handle, mapping)
235 236
237 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
238 """Fast FASTQ to FASTA conversion (PRIVATE). 239 240 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 241 Seq objects in order to speed up this conversion. 242 243 NOTE - This does NOT check the characters used in the FASTQ quality string 244 are valid! 245 """ 246 from Bio.SeqIO.QualityIO import FastqGeneralIterator 247 # For real speed, don't even make SeqRecord and Seq objects! 248 count = 0 249 for title, seq, qual in FastqGeneralIterator(in_handle): 250 count += 1 251 out_handle.write(">%s\n" % title) 252 # Do line wrapping 253 for i in range(0, len(seq), 60): 254 out_handle.write(seq[i:i + 60] + "\n") 255 return count
256 257
258 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
259 """Fast FASTQ to simple tabbed conversion (PRIVATE). 260 261 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 262 Seq objects in order to speed up this conversion. 263 264 NOTE - This does NOT check the characters used in the FASTQ quality string 265 are valid! 266 """ 267 from Bio.SeqIO.QualityIO import FastqGeneralIterator 268 # For real speed, don't even make SeqRecord and Seq objects! 269 count = 0 270 for title, seq, qual in FastqGeneralIterator(in_handle): 271 count += 1 272 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 273 return count
274 275
276 -def _fastq_convert_qual(in_handle, out_handle, mapping):
277 """FASTQ helper function for QUAL output (PRIVATE). 278 279 Mapping should be a dictionary mapping expected ASCII characters from the 280 FASTQ quality string to PHRED quality scores (as strings). 281 """ 282 from Bio.SeqIO.QualityIO import FastqGeneralIterator 283 # For real speed, don't even make SeqRecord and Seq objects! 284 count = 0 285 for title, seq, qual in FastqGeneralIterator(in_handle): 286 count += 1 287 out_handle.write(">%s\n" % title) 288 # map the qual... note even with Sanger encoding max 2 digits 289 try: 290 qualities_strs = [mapping[ascii] for ascii in qual] 291 except KeyError: 292 raise ValueError("Invalid character in quality string") 293 data = " ".join(qualities_strs) 294 while len(data) > 60: 295 # Know quality scores are either 1 or 2 digits, so there 296 # must be a space in any three consecutive characters. 297 if data[60] == " ": 298 out_handle.write(data[:60] + "\n") 299 data = data[61:] 300 elif data[59] == " ": 301 out_handle.write(data[:59] + "\n") 302 data = data[60:] 303 else: 304 assert data[58] == " ", "Internal logic failure in wrapping" 305 out_handle.write(data[:58] + "\n") 306 data = data[59:] 307 out_handle.write(data + "\n") 308 return count
309 310
311 -def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
312 """Fast Sanger FASTQ to QUAL conversion (PRIVATE).""" 313 mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1)) 314 return _fastq_convert_qual(in_handle, out_handle, mapping)
315 316
317 -def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
318 """Fast Solexa FASTQ to QUAL conversion (PRIVATE).""" 319 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 320 mapping = dict((chr(q + 64), str(int(round(phred_quality_from_solexa(q))))) 321 for q in range(-5, 62 + 1)) 322 return _fastq_convert_qual(in_handle, out_handle, mapping)
323 324
325 -def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
326 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE).""" 327 mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1)) 328 return _fastq_convert_qual(in_handle, out_handle, mapping)
329 330 331 # TODO? - Handling aliases explicitly would let us shorten this list: 332 _converter = { 333 ("genbank", "fasta"): _genbank_convert_fasta, 334 ("gb", "fasta"): _genbank_convert_fasta, 335 ("embl", "fasta"): _embl_convert_fasta, 336 ("fastq", "fasta"): _fastq_convert_fasta, 337 ("fastq-sanger", "fasta"): _fastq_convert_fasta, 338 ("fastq-solexa", "fasta"): _fastq_convert_fasta, 339 ("fastq-illumina", "fasta"): _fastq_convert_fasta, 340 ("fastq", "tab"): _fastq_convert_tab, 341 ("fastq-sanger", "tab"): _fastq_convert_tab, 342 ("fastq-solexa", "tab"): _fastq_convert_tab, 343 ("fastq-illumina", "tab"): _fastq_convert_tab, 344 ("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger, 345 ("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger, 346 ("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger, 347 ("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger, 348 ("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 349 ("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 350 ("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger, 351 ("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger, 352 ("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 353 ("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 354 ("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa, 355 ("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa, 356 ("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 357 ("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 358 ("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina, 359 ("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina, 360 ("fastq", "qual"): _fastq_sanger_convert_qual, 361 ("fastq-sanger", "qual"): _fastq_sanger_convert_qual, 362 ("fastq-solexa", "qual"): _fastq_solexa_convert_qual, 363 ("fastq-illumina", "qual"): _fastq_illumina_convert_qual, 364 } 365 366
367 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
368 """Convert handles from one format to another (PRIVATE).""" 369 try: 370 f = _converter[(in_format, out_format)] 371 except KeyError: 372 f = None 373 if f: 374 return f(in_handle, out_handle, alphabet) 375 else: 376 records = SeqIO.parse(in_handle, in_format, alphabet) 377 return SeqIO.write(records, out_handle, out_format)
378