1
2
3
4
5
6
7
8
9 """
10 Parser for PHD files output by PHRED and used by PHRAP and CONSED.
11
12 This module can be used directly which will return Record objects
13 which should contain all the original data in the file.
14
15 Alternatively, using Bio.SeqIO with the "phd" format will call this module
16 internally. This will give SeqRecord objects for each contig sequence.
17 """
18
19 from Bio import Seq
20 from Bio.Alphabet import generic_dna
21
22 CKEYWORDS = ['CHROMAT_FILE', 'ABI_THUMBPRINT', 'PHRED_VERSION', 'CALL_METHOD',
23 'QUALITY_LEVELS', 'TIME', 'TRACE_ARRAY_MIN_INDEX', 'TRACE_ARRAY_MAX_INDEX',
24 'TRIM', 'TRACE_PEAK_AREA_RATIO', 'CHEM', 'DYE']
25
26
28 """Hold information from a PHD file."""
30 self.file_name = ''
31 self.comments = {}
32 for kw in CKEYWORDS:
33 self.comments[kw.lower()] = None
34 self.sites = []
35 self.seq = ''
36 self.seq_trimmed = ''
37
38
40 """Reads the next PHD record from the file, returning it as a Record object.
41
42 This function reads PHD file data line by line from the handle,
43 and returns a single Record object.
44 """
45 for line in handle:
46 if line.startswith("BEGIN_SEQUENCE"):
47 record = Record()
48 record.file_name = line[15:].rstrip()
49 break
50 else:
51 return
52
53 for line in handle:
54 if line.startswith("BEGIN_COMMENT"):
55 break
56 else:
57 raise ValueError("Failed to find BEGIN_COMMENT line")
58
59 for line in handle:
60 line = line.strip()
61 if not line:
62 continue
63 if line == "END_COMMENT":
64 break
65 keyword, value = line.split(":", 1)
66 keyword = keyword.lower()
67 value = value.strip()
68 if keyword in ('chromat_file',
69 'phred_version',
70 'call_method',
71 'chem',
72 'dye',
73 'time',
74 'basecaller_version',
75 'trace_processor_version'):
76 record.comments[keyword] = value
77 elif keyword in ('abi_thumbprint',
78 'quality_levels',
79 'trace_array_min_index',
80 'trace_array_max_index'):
81 record.comments[keyword] = int(value)
82 elif keyword == 'trace_peak_area_ratio':
83 record.comments[keyword] = float(value)
84 elif keyword == 'trim':
85 first, last, prob = value.split()
86 record.comments[keyword] = (int(first), int(last), float(prob))
87 else:
88 raise ValueError("Failed to find END_COMMENT line")
89
90 for line in handle:
91 if line.startswith('BEGIN_DNA'):
92 break
93 else:
94 raise ValueError("Failed to find BEGIN_DNA line")
95
96 for line in handle:
97 if line.startswith('END_DNA'):
98 break
99 else:
100
101
102
103 parts = line.split()
104 if len(parts) in [2, 3]:
105 record.sites.append(tuple(parts))
106 else:
107 raise ValueError("DNA line must contain a base and quality "
108 "score, and optionally a peak location.")
109
110 for line in handle:
111 if line.startswith("END_SEQUENCE"):
112 break
113 else:
114 raise ValueError("Failed to find END_SEQUENCE line")
115
116 record.seq = Seq.Seq(''.join([n[0] for n in record.sites]), generic_dna)
117 if record.comments['trim'] is not None:
118 first, last = record.comments['trim'][:2]
119 record.seq_trimmed = record.seq[first:last]
120
121 return record
122
123
125 """Iterates over a file returning multiple PHD records.
126
127 The data is read line by line from the handle. The handle can be a list
128 of lines, an open file, or similar; the only requirement is that we can
129 iterate over the handle to retrieve lines from it.
130
131 Typical usage:
132
133 records = parse(handle)
134 for record in records:
135 # do something with the record object
136 """
137 while True:
138 record = read(handle)
139 if not record:
140 return
141 yield record
142