Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009-2016 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is that while doing this will work:: 
 12   
 13      from Bio import SeqIO 
 14      records = SeqIO.parse(in_handle, in_format) 
 15      count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write:: 
 18   
 19      from Bio import SeqIO 
 20      count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import BiopythonWarning 
 28  from Bio import SeqIO 
 29  # NOTE - Lots of lazy imports further on... 
 30   
 31   
32 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
33 """Fast GenBank to FASTA (PRIVATE).""" 34 # We don't need to parse the features... 35 from Bio.GenBank.Scanner import GenBankScanner 36 records = GenBankScanner().parse_records(in_handle, do_features=False) 37 # For FASTA output we can ignore the alphabet too 38 return SeqIO.write(records, out_handle, "fasta")
39 40
41 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
42 """Fast EMBL to FASTA (PRIVATE).""" 43 # We don't need to parse the features... 44 from Bio.GenBank.Scanner import EmblScanner 45 records = EmblScanner().parse_records(in_handle, do_features=False) 46 # For FASTA output we can ignore the alphabet too 47 return SeqIO.write(records, out_handle, "fasta")
48 49
50 -def _fastq_generic(in_handle, out_handle, mapping):
51 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 52 from Bio.SeqIO.QualityIO import FastqGeneralIterator 53 # For real speed, don't even make SeqRecord and Seq objects! 54 count = 0 55 null = chr(0) 56 for title, seq, old_qual in FastqGeneralIterator(in_handle): 57 count += 1 58 # map the qual... 59 qual = old_qual.translate(mapping) 60 if null in qual: 61 raise ValueError("Invalid character in quality string") 62 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 63 return count
64 65
66 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
67 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 68 from Bio.SeqIO.QualityIO import FastqGeneralIterator 69 # For real speed, don't even make SeqRecord and Seq objects! 70 count = 0 71 null = chr(0) 72 for title, seq, old_qual in FastqGeneralIterator(in_handle): 73 count += 1 74 # map the qual... 75 qual = old_qual.translate(mapping) 76 if null in qual: 77 raise ValueError("Invalid character in quality string") 78 if truncate_char in qual: 79 qual = qual.replace(truncate_char, chr(126)) 80 import warnings 81 warnings.warn(truncate_msg, BiopythonWarning) 82 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 83 return count
84 85
86 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
87 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 88 89 Useful for removing line wrapping and the redundant second identifier 90 on the plus lines. Will check also check the quality string is valid. 91 92 Avoids creating SeqRecord and Seq objects in order to speed up this 93 conversion. 94 """ 95 # Map unexpected chars to null 96 mapping = "".join([chr(0) for ascii in range(0, 33)] + 97 [chr(ascii) for ascii in range(33, 127)] + 98 [chr(0) for ascii in range(127, 256)]) 99 assert len(mapping) == 256 100 return _fastq_generic(in_handle, out_handle, mapping)
101 102
103 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
104 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 105 106 Useful for removing line wrapping and the redundant second identifier 107 on the plus lines. Will check also check the quality string is valid. 108 Avoids creating SeqRecord and Seq objects in order to speed up this 109 conversion. 110 """ 111 # Map unexpected chars to null 112 mapping = "".join([chr(0) for ascii in range(0, 59)] + 113 [chr(ascii) for ascii in range(59, 127)] + 114 [chr(0) for ascii in range(127, 256)]) 115 assert len(mapping) == 256 116 return _fastq_generic(in_handle, out_handle, mapping)
117 118
119 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
120 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 121 122 Useful for removing line wrapping and the redundant second identifier 123 on the plus lines. Will check also check the quality string is valid. 124 Avoids creating SeqRecord and Seq objects in order to speed up this 125 conversion. 126 """ 127 # Map unexpected chars to null 128 mapping = "".join([chr(0) for ascii in range(0, 64)] + 129 [chr(ascii) for ascii in range(64, 127)] + 130 [chr(0) for ascii in range(127, 256)]) 131 assert len(mapping) == 256 132 return _fastq_generic(in_handle, out_handle, mapping)
133 134
135 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
136 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 137 138 Avoids creating SeqRecord and Seq objects in order to speed up this 139 conversion. 140 """ 141 # Map unexpected chars to null 142 mapping = "".join([chr(0) for ascii in range(0, 64)] + 143 [chr(33 + q) for q in range(0, 62 + 1)] + 144 [chr(0) for ascii in range(127, 256)]) 145 assert len(mapping) == 256 146 return _fastq_generic(in_handle, out_handle, mapping)
147 148
149 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
150 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 151 152 Avoids creating SeqRecord and Seq objects in order to speed up this 153 conversion. Will issue a warning if the scores had to be truncated at 62 154 (maximum possible in the Illumina 1.3+ FASTQ format) 155 """ 156 # Map unexpected chars to null 157 trunc_char = chr(1) 158 mapping = "".join([chr(0) for ascii in range(0, 33)] + 159 [chr(64 + q) for q in range(0, 62 + 1)] + 160 [trunc_char for ascii in range(96, 127)] + 161 [chr(0) for ascii in range(127, 256)]) 162 assert len(mapping) == 256 163 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 164 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
165 166
167 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
168 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 169 170 Avoids creating SeqRecord and Seq objects in order to speed up this 171 conversion. 172 """ 173 # Map unexpected chars to null 174 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 175 mapping = "".join([chr(0) for ascii in range(0, 59)] + 176 [chr(33 + int(round(phred_quality_from_solexa(q)))) 177 for q in range(-5, 62 + 1)] + 178 [chr(0) for ascii in range(127, 256)]) 179 assert len(mapping) == 256 180 return _fastq_generic(in_handle, out_handle, mapping)
181 182
183 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
184 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 185 186 Avoids creating SeqRecord and Seq objects in order to speed up this 187 conversion. Will issue a warning if the scores had to be truncated at 62 188 (maximum possible in the Solexa FASTQ format) 189 """ 190 # Map unexpected chars to null 191 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 192 trunc_char = chr(1) 193 mapping = "".join([chr(0) for ascii in range(0, 33)] + 194 [chr(64 + int(round(solexa_quality_from_phred(q)))) 195 for q in range(0, 62 + 1)] + 196 [trunc_char for ascii in range(96, 127)] + 197 [chr(0) for ascii in range(127, 256)]) 198 assert len(mapping) == 256 199 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 200 "Data loss - max Solexa quality 62 in Solexa FASTQ")
201 202
203 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
204 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 205 206 Avoids creating SeqRecord and Seq objects in order to speed up this 207 conversion. 208 """ 209 # Map unexpected chars to null 210 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 211 mapping = "".join([chr(0) for ascii in range(0, 59)] + 212 [chr(64 + int(round(phred_quality_from_solexa(q)))) 213 for q in range(-5, 62 + 1)] + 214 [chr(0) for ascii in range(127, 256)]) 215 assert len(mapping) == 256 216 return _fastq_generic(in_handle, out_handle, mapping)
217 218
219 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
220 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 221 222 Avoids creating SeqRecord and Seq objects in order to speed up this 223 conversion. 224 """ 225 # Map unexpected chars to null 226 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 227 mapping = "".join([chr(0) for ascii in range(0, 64)] + 228 [chr(64 + int(round(solexa_quality_from_phred(q)))) 229 for q in range(0, 62 + 1)] + 230 [chr(0) for ascii in range(127, 256)]) 231 assert len(mapping) == 256 232 return _fastq_generic(in_handle, out_handle, mapping)
233 234
235 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
236 """Fast FASTQ to FASTA conversion (PRIVATE). 237 238 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 239 Seq objects in order to speed up this conversion. 240 241 NOTE - This does NOT check the characters used in the FASTQ quality string 242 are valid! 243 """ 244 from Bio.SeqIO.QualityIO import FastqGeneralIterator 245 # For real speed, don't even make SeqRecord and Seq objects! 246 count = 0 247 for title, seq, qual in FastqGeneralIterator(in_handle): 248 count += 1 249 out_handle.write(">%s\n" % title) 250 # Do line wrapping 251 for i in range(0, len(seq), 60): 252 out_handle.write(seq[i:i + 60] + "\n") 253 return count
254 255
256 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
257 """Fast FASTQ to simple tabbed conversion (PRIVATE). 258 259 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 260 Seq objects in order to speed up this conversion. 261 262 NOTE - This does NOT check the characters used in the FASTQ quality string 263 are valid! 264 """ 265 from Bio.SeqIO.QualityIO import FastqGeneralIterator 266 # For real speed, don't even make SeqRecord and Seq objects! 267 count = 0 268 for title, seq, qual in FastqGeneralIterator(in_handle): 269 count += 1 270 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 271 return count
272 273
274 -def _fastq_convert_qual(in_handle, out_handle, mapping):
275 """FASTQ helper function for QUAL output (PRIVATE). 276 277 Mapping should be a dictionary mapping expected ASCII characters from the 278 FASTQ quality string to PHRED quality scores (as strings). 279 """ 280 from Bio.SeqIO.QualityIO import FastqGeneralIterator 281 # For real speed, don't even make SeqRecord and Seq objects! 282 count = 0 283 for title, seq, qual in FastqGeneralIterator(in_handle): 284 count += 1 285 out_handle.write(">%s\n" % title) 286 # map the qual... note even with Sanger encoding max 2 digits 287 try: 288 qualities_strs = [mapping[ascii] for ascii in qual] 289 except KeyError: 290 raise ValueError("Invalid character in quality string") 291 data = " ".join(qualities_strs) 292 while len(data) > 60: 293 # Know quality scores are either 1 or 2 digits, so there 294 # must be a space in any three consecutive characters. 295 if data[60] == " ": 296 out_handle.write(data[:60] + "\n") 297 data = data[61:] 298 elif data[59] == " ": 299 out_handle.write(data[:59] + "\n") 300 data = data[60:] 301 else: 302 assert data[58] == " ", "Internal logic failure in wrapping" 303 out_handle.write(data[:58] + "\n") 304 data = data[59:] 305 out_handle.write(data + "\n") 306 return count
307 308
309 -def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
310 """Fast Sanger FASTQ to QUAL conversion (PRIVATE).""" 311 mapping = dict((chr(q + 33), str(q)) for q in range(0, 93 + 1)) 312 return _fastq_convert_qual(in_handle, out_handle, mapping)
313 314
315 -def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
316 """Fast Solexa FASTQ to QUAL conversion (PRIVATE).""" 317 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 318 mapping = dict((chr(q + 64), str(int(round(phred_quality_from_solexa(q))))) 319 for q in range(-5, 62 + 1)) 320 return _fastq_convert_qual(in_handle, out_handle, mapping)
321 322
323 -def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
324 """Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE).""" 325 mapping = dict((chr(q + 64), str(q)) for q in range(0, 62 + 1)) 326 return _fastq_convert_qual(in_handle, out_handle, mapping)
327 328 329 # TODO? - Handling aliases explicitly would let us shorten this list: 330 _converter = { 331 ("genbank", "fasta"): _genbank_convert_fasta, 332 ("gb", "fasta"): _genbank_convert_fasta, 333 ("embl", "fasta"): _embl_convert_fasta, 334 ("fastq", "fasta"): _fastq_convert_fasta, 335 ("fastq-sanger", "fasta"): _fastq_convert_fasta, 336 ("fastq-solexa", "fasta"): _fastq_convert_fasta, 337 ("fastq-illumina", "fasta"): _fastq_convert_fasta, 338 ("fastq", "tab"): _fastq_convert_tab, 339 ("fastq-sanger", "tab"): _fastq_convert_tab, 340 ("fastq-solexa", "tab"): _fastq_convert_tab, 341 ("fastq-illumina", "tab"): _fastq_convert_tab, 342 ("fastq", "fastq"): _fastq_sanger_convert_fastq_sanger, 343 ("fastq-sanger", "fastq"): _fastq_sanger_convert_fastq_sanger, 344 ("fastq-solexa", "fastq"): _fastq_solexa_convert_fastq_sanger, 345 ("fastq-illumina", "fastq"): _fastq_illumina_convert_fastq_sanger, 346 ("fastq", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 347 ("fastq-sanger", "fastq-sanger"): _fastq_sanger_convert_fastq_sanger, 348 ("fastq-solexa", "fastq-sanger"): _fastq_solexa_convert_fastq_sanger, 349 ("fastq-illumina", "fastq-sanger"): _fastq_illumina_convert_fastq_sanger, 350 ("fastq", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 351 ("fastq-sanger", "fastq-solexa"): _fastq_sanger_convert_fastq_solexa, 352 ("fastq-solexa", "fastq-solexa"): _fastq_solexa_convert_fastq_solexa, 353 ("fastq-illumina", "fastq-solexa"): _fastq_illumina_convert_fastq_solexa, 354 ("fastq", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 355 ("fastq-sanger", "fastq-illumina"): _fastq_sanger_convert_fastq_illumina, 356 ("fastq-solexa", "fastq-illumina"): _fastq_solexa_convert_fastq_illumina, 357 ("fastq-illumina", "fastq-illumina"): _fastq_illumina_convert_fastq_illumina, 358 ("fastq", "qual"): _fastq_sanger_convert_qual, 359 ("fastq-sanger", "qual"): _fastq_sanger_convert_qual, 360 ("fastq-solexa", "qual"): _fastq_solexa_convert_qual, 361 ("fastq-illumina", "qual"): _fastq_illumina_convert_qual, 362 } 363 364
365 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
366 """SeqIO conversion function (PRIVATE).""" 367 try: 368 f = _converter[(in_format, out_format)] 369 except KeyError: 370 f = None 371 if f: 372 return f(in_handle, out_handle, alphabet) 373 else: 374 records = SeqIO.parse(in_handle, in_format, alphabet) 375 return SeqIO.write(records, out_handle, out_format)
376