Package Bio :: Package SeqIO :: Module PhdIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PhdIO

  1  # Copyright 2008-2016 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2009 by Cymon J. Cox.  All rights reserved. 
  3  # 
  4  # This file is part of the Biopython distribution and governed by your 
  5  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  6  # Please see the LICENSE file that should have been included as part of this 
  7  # package. 
  8  """Bio.SeqIO support for the "phd" file format. 
  9   
 10  PHD files are output by PHRED and used by PHRAP and CONSED. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions, under the 
 13  format name "phd". See also the underlying Bio.Sequencing.Phd module. 
 14   
 15  For example, using Bio.SeqIO we can read in one of the example PHRED files 
 16  from the Biopython unit tests: 
 17   
 18      >>> from Bio import SeqIO 
 19      >>> for record in SeqIO.parse("Phd/phd1", "phd"): 
 20      ...     print(record.id) 
 21      ...     print("%s..." % record.seq[:10]) 
 22      ...     print("%s..." % record.letter_annotations["phred_quality"][:10]) 
 23      34_222_(80-A03-19).b.ab1 
 24      ctccgtcgga... 
 25      [9, 9, 10, 19, 22, 37, 28, 28, 24, 22]... 
 26      425_103_(81-A03-19).g.ab1 
 27      cgggatccca... 
 28      [14, 17, 22, 10, 10, 10, 15, 8, 8, 9]... 
 29      425_7_(71-A03-19).b.ab1 
 30      acataaatca... 
 31      [10, 10, 10, 10, 8, 8, 6, 6, 6, 6]... 
 32   
 33  Since PHRED files contain quality scores, you can save them as FASTQ or as 
 34  QUAL files, for example using Bio.SeqIO.write(...), or simply with the format 
 35  method of the SeqRecord object: 
 36   
 37      >>> print(record[:50].format("fastq")) 
 38      @425_7_(71-A03-19).b.ab1 
 39      acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag 
 40      + 
 41      ++++))'''')(''')$!$''')''''(+.''$!$))))+)))''''''' 
 42      <BLANKLINE> 
 43   
 44  Or, 
 45   
 46      >>> print(record[:50].format("qual")) 
 47      >425_7_(71-A03-19).b.ab1 
 48      10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7 
 49      10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6 
 50      <BLANKLINE> 
 51   
 52  Note these examples only show the first 50 bases to keep the output short. 
 53  """ 
 54   
 55  from __future__ import print_function 
 56   
 57  from Bio.SeqRecord import SeqRecord 
 58  from Bio.Sequencing import Phd 
 59  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 60  from Bio.SeqIO import QualityIO 
 61   
 62   
63 -def PhdIterator(handle):
64 """Return SeqRecord objects from a PHD file. 65 66 This uses the Bio.Sequencing.Phd module to do the hard work. 67 """ 68 phd_records = Phd.parse(handle) 69 for phd_record in phd_records: 70 # Convert the PHY record into a SeqRecord... 71 # The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1' 72 # from unit test example file phd_solexa. 73 # This will cause problems if used as the record identifier 74 # (e.g. output for FASTQ format). 75 name = phd_record.file_name.split(None, 1)[0] 76 seq_record = SeqRecord(phd_record.seq, 77 id=name, name=name, 78 description=phd_record.file_name) 79 # Just re-use the comments dictionary as the SeqRecord's annotations 80 seq_record.annotations = phd_record.comments 81 # And store the qualities and peak locations as per-letter-annotation 82 seq_record.letter_annotations["phred_quality"] = \ 83 [int(site[1]) for site in phd_record.sites] 84 try: 85 seq_record.letter_annotations["peak_location"] = \ 86 [int(site[2]) for site in phd_record.sites] 87 except IndexError: 88 # peak locations are not always there according to 89 # David Gordon (the Consed author) 90 pass 91 yield seq_record
92 # All done 93 94
95 -class PhdWriter(SequentialSequenceWriter):
96 """Class to write Phd format files.""" 97
98 - def __init__(self, handle):
99 """Initialize the class.""" 100 SequentialSequenceWriter.__init__(self, handle)
101
102 - def write_record(self, record):
103 """Write a single Phd record to the file.""" 104 assert record.seq, "No sequence present in SeqRecord" 105 # This method returns the 'phred_quality' scores or converted 106 # 'solexa_quality' scores if present, else raises a value error 107 phred_qualities = QualityIO._get_phred_quality(record) 108 peak_locations = record.letter_annotations.get("peak_location") 109 assert len(record.seq) == len(phred_qualities), "Number of " + \ 110 "phd quality scores does not match length of sequence" 111 if peak_locations: 112 assert len(record.seq) == len(peak_locations), "Number " + \ 113 "of peak location scores does not match length of sequence" 114 if None in phred_qualities: 115 raise ValueError("A quality value of None was found") 116 if record.description.startswith("%s " % record.id): 117 title = record.description 118 else: 119 title = "%s %s" % (record.id, record.description) 120 self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n" 121 % self.clean(title)) 122 for annot in [k.lower() for k in Phd.CKEYWORDS]: 123 value = None 124 if annot == "trim": 125 if record.annotations.get("trim"): 126 value = "%s %s %.4f" % record.annotations["trim"] 127 elif annot == "trace_peak_area_ratio": 128 if record.annotations.get("trace_peak_area_ratio"): 129 value = "%.4f" % record.annotations[ 130 "trace_peak_area_ratio"] 131 else: 132 value = record.annotations.get(annot) 133 if value or value == 0: 134 self.handle.write("%s: %s\n" % (annot.upper(), value)) 135 136 self.handle.write("END_COMMENT\nBEGIN_DNA\n") 137 for i, site in enumerate(record.seq): 138 if peak_locations: 139 self.handle.write("%s %i %i\n" % ( 140 site, 141 round(phred_qualities[i]), 142 peak_locations[i]) 143 ) 144 else: 145 self.handle.write("%s %i\n" % ( 146 site, 147 round(phred_qualities[i])) 148 ) 149 150 self.handle.write("END_DNA\nEND_SEQUENCE\n")
151 152 153 if __name__ == "__main__": 154 from Bio._utils import run_doctest 155 run_doctest() 156