Package Bio :: Package SeqIO :: Module AceIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.AceIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO support for the "ace" file format. 
  8   
  9  You are expected to use this module via the Bio.SeqIO functions. 
 10  See also the Bio.Sequencing.Ace module which offers more than just accessing 
 11  the contig consensus sequences in an ACE file as SeqRecord objects. 
 12  """ 
 13   
 14  from __future__ import print_function 
 15   
 16  from Bio.Seq import Seq 
 17  from Bio.SeqRecord import SeqRecord 
 18  from Bio.Alphabet import generic_nucleotide, generic_dna, generic_rna, Gapped 
 19  from Bio.Sequencing import Ace 
 20   
 21  __docformat__ = "restructuredtext en" 
 22   
 23   
24 -def AceIterator(handle):
25 """Returns SeqRecord objects from an ACE file. 26 27 This uses the Bio.Sequencing.Ace module to do the hard work. Note that 28 by iterating over the file in a single pass, we are forced to ignore any 29 WA, CT, RT or WR footer tags. 30 31 Ace files include the base quality for each position, which are taken 32 to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file 33 using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's 34 letter_annotations dictionary under the "phred_quality" key. 35 36 >>> from Bio import SeqIO 37 >>> with open("Ace/consed_sample.ace", "rU") as handle: 38 ... for record in SeqIO.parse(handle, "ace"): 39 ... print("%s %s... %i" % (record.id, record.seq[:10], len(record))) 40 ... print(max(record.letter_annotations["phred_quality"])) 41 Contig1 agccccgggc... 1475 42 90 43 44 However, ACE files do not include a base quality for any gaps in the 45 consensus sequence, and these are represented in Biopython with a quality 46 of zero. Using zero is perhaps misleading as there may be very strong 47 evidence to support the gap in the consensus. Previous versions of 48 Biopython therefore used None instead, but this complicated usage, and 49 prevented output of the gapped sequence as FASTQ format. 50 51 >>> from Bio import SeqIO 52 >>> with open("Ace/contig1.ace", "rU") as handle: 53 ... for record in SeqIO.parse(handle, "ace"): 54 ... print("%s ...%s..." % (record.id, record.seq[85:95])) 55 ... print(record.letter_annotations["phred_quality"][85:95]) 56 ... print(max(record.letter_annotations["phred_quality"])) 57 Contig1 ...AGAGG-ATGC... 58 [57, 57, 54, 57, 57, 0, 57, 72, 72, 72] 59 90 60 Contig2 ...GAATTACTAT... 61 [68, 68, 68, 68, 68, 68, 68, 68, 68, 68] 62 90 63 64 """ 65 for ace_contig in Ace.parse(handle): 66 # Convert the ACE contig record into a SeqRecord... 67 consensus_seq_str = ace_contig.sequence 68 # Assume its DNA unless there is a U in it, 69 if "U" in consensus_seq_str: 70 if "T" in consensus_seq_str: 71 # Very odd! Error? 72 alpha = generic_nucleotide 73 else: 74 alpha = generic_rna 75 else: 76 alpha = generic_dna 77 78 if "*" in consensus_seq_str: 79 # For consistency with most other file formats, map 80 # any * gaps into - gaps. 81 assert "-" not in consensus_seq_str 82 consensus_seq = Seq(consensus_seq_str.replace("*", "-"), 83 Gapped(alpha, gap_char="-")) 84 else: 85 consensus_seq = Seq(consensus_seq_str, alpha) 86 87 # TODO? - Base segments (BS lines) which indicates which read 88 # phrap has chosen to be the consensus at a particular position. 89 # Perhaps as SeqFeature objects? 90 91 # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines) 92 # Perhaps as SeqFeature objects? 93 94 seq_record = SeqRecord(consensus_seq, 95 id=ace_contig.name, 96 name=ace_contig.name) 97 98 # Consensus base quality (BQ lines). Note that any gaps (originally 99 # as * characters) in the consensus do not get a quality entry, so 100 # we assign a quality of None (zero would be missleading as there may 101 # be excelent support for having a gap here). 102 quals = [] 103 i = 0 104 for base in consensus_seq: 105 if base == "-": 106 quals.append(0) 107 else: 108 quals.append(ace_contig.quality[i]) 109 i += 1 110 assert i == len(ace_contig.quality) 111 seq_record.letter_annotations["phred_quality"] = quals 112 113 yield seq_record
114 # All done 115 116 117 if __name__ == "__main__": 118 from Bio._utils import run_doctest 119 run_doctest() 120