1 """Provide trainers which estimate parameters based on training sequences.
2
3 These should be used to 'train' a Markov Model prior to actually using
4 it to decode state paths. When supplied training sequences and a model
5 to work from, these classes will estimate parameters of the model.
6
7 This aims to estimate two parameters:
8
9 * a_{kl} -- the number of times there is a transition from k to l in the
10 training data.
11
12 * e_{k}(b) -- the number of emissions of the state b from the letter k
13 in the training data.
14 """
15
16 import math
17
18
19 from DynamicProgramming import ScaledDPAlgorithms
20
21
23 """Hold a training sequence with emissions and optionally, a state path.
24 """
25 - def __init__(self, emissions, state_path):
26 """Initialize a training sequence.
27
28 Arguments:
29
30 o emissions - A Seq object containing the sequence of emissions in
31 the training sequence, and the alphabet of the sequence.
32
33 o state_path - A Seq object containing the sequence of states and
34 the alphabet of the states. If there is no known state path, then
35 the sequence of states should be an empty string.
36 """
37 if len(state_path) > 0:
38 assert len(emissions) == len(state_path), \
39 "State path does not match associated emissions."
40 self.emissions = emissions
41 self.states = state_path
42
43
45 """Provide generic functionality needed in all trainers.
46 """
48 self._markov_model = markov_model
49
51 """Calculate the log likelihood of the training seqs.
52
53 Arguments:
54
55 o probabilities -- A list of the probabilities of each training
56 sequence under the current parameters, calculated using the forward
57 algorithm.
58 """
59 total_likelihood = 0
60 for probability in probabilities:
61 total_likelihood += math.log(probability)
62
63 return total_likelihood
64
66 """Get a maximum likelihood estimation of transition and emmission.
67
68 Arguments:
69
70 o transition_counts -- A dictionary with the total number of counts
71 of transitions between two states.
72
73 o emissions_counts -- A dictionary with the total number of counts
74 of emmissions of a particular emission letter by a state letter.
75
76 This then returns the maximum likelihood estimators for the
77 transitions and emissions, estimated by formulas 3.18 in
78 Durbin et al:
79
80 a_{kl} = A_{kl} / sum(A_{kl'})
81 e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))
82
83 Returns:
84 Transition and emission dictionaries containing the maximum
85 likelihood estimators.
86 """
87
88 ml_transitions = self.ml_estimator(transition_counts)
89 ml_emissions = self.ml_estimator(emission_counts)
90
91 return ml_transitions, ml_emissions
92
94 """Calculate the maximum likelihood estimator.
95
96 This can calculate maximum likelihoods for both transitions
97 and emissions.
98
99 Arguments:
100
101 o counts -- A dictionary of the counts for each item.
102
103 See estimate_params for a description of the formula used for
104 calculation.
105 """
106
107 all_ordered = counts.keys()
108 all_ordered.sort()
109
110 ml_estimation = {}
111
112
113 cur_letter = None
114 cur_letter_counts = 0
115
116 for cur_item in all_ordered:
117
118 if cur_item[0] != cur_letter:
119
120 cur_letter = cur_item[0]
121
122
123 cur_letter_counts = counts[cur_item]
124
125
126 cur_position = all_ordered.index(cur_item) + 1
127
128
129
130 while (cur_position < len(all_ordered) and
131 all_ordered[cur_position][0] == cur_item[0]):
132 cur_letter_counts += counts[all_ordered[cur_position]]
133 cur_position += 1
134
135 else:
136 pass
137
138
139 cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
140 ml_estimation[cur_item] = cur_ml
141
142 return ml_estimation
143
144
146 """Trainer that uses the Baum-Welch algorithm to estimate parameters.
147
148 These should be used when a training sequence for an HMM has unknown
149 paths for the actual states, and you need to make an estimation of the
150 model parameters from the observed emissions.
151
152 This uses the Baum-Welch algorithm, first described in
153 Baum, L.E. 1972. Inequalities. 3:1-8
154 This is based on the description in 'Biological Sequence Analysis' by
155 Durbin et al. in section 3.3
156
157 This algorithm is guaranteed to converge to a local maximum, but not
158 necessarily to the global maxima, so use with care!
159 """
161 """Initialize the trainer.
162
163 Arguments:
164
165 o markov_model - The model we are going to estimate parameters for.
166 This should have the parameters with some initial estimates, that
167 we can build from.
168 """
169 AbstractTrainer.__init__(self, markov_model)
170
173 """Estimate the parameters using training sequences.
174
175 The algorithm for this is taken from Durbin et al. p64, so this
176 is a good place to go for a reference on what is going on.
177
178 Arguments:
179
180 o training_seqs -- A list of TrainingSequence objects to be used
181 for estimating the parameters.
182
183 o stopping_criteria -- A function, that when passed the change
184 in log likelihood and threshold, will indicate if we should stop
185 the estimation iterations.
186
187 o dp_method -- A class instance specifying the dynamic programming
188 implementation we should use to calculate the forward and
189 backward variables. By default, we use the scaling method.
190 """
191 prev_log_likelihood = None
192 num_iterations = 1
193
194 while 1:
195 transition_count = self._markov_model.get_blank_transitions()
196 emission_count = self._markov_model.get_blank_emissions()
197
198
199 all_probabilities = []
200
201 for training_seq in training_seqs:
202
203 DP = dp_method(self._markov_model, training_seq)
204 forward_var, seq_prob = DP.forward_algorithm()
205 backward_var = DP.backward_algorithm()
206
207 all_probabilities.append(seq_prob)
208
209
210 transition_count = self.update_transitions(transition_count,
211 training_seq,
212 forward_var,
213 backward_var,
214 seq_prob)
215 emission_count = self.update_emissions(emission_count,
216 training_seq,
217 forward_var,
218 backward_var,
219 seq_prob)
220
221
222 ml_transitions, ml_emissions = \
223 self.estimate_params(transition_count, emission_count)
224 self._markov_model.transition_prob = ml_transitions
225 self._markov_model.emission_prob = ml_emissions
226
227 cur_log_likelihood = self.log_likelihood(all_probabilities)
228
229
230
231 if prev_log_likelihood is not None:
232
233
234
235 log_likelihood_change = abs(abs(cur_log_likelihood) -
236 abs(prev_log_likelihood))
237
238
239
240 if stopping_criteria(log_likelihood_change, num_iterations):
241 break
242
243
244 prev_log_likelihood = cur_log_likelihood
245 num_iterations += 1
246
247 return self._markov_model
248
249 - def update_transitions(self, transition_counts, training_seq,
250 forward_vars, backward_vars, training_seq_prob):
251 """Add the contribution of a new training sequence to the transitions.
252
253 Arguments:
254
255 o transition_counts -- A dictionary of the current counts for the
256 transitions
257
258 o training_seq -- The training sequence we are working with
259
260 o forward_vars -- Probabilities calculated using the forward
261 algorithm.
262
263 o backward_vars -- Probabilities calculated using the backwards
264 algorithm.
265
266 o training_seq_prob - The probability of the current sequence.
267
268 This calculates A_{kl} (the estimated transition counts from state
269 k to state l) using formula 3.20 in Durbin et al.
270 """
271
272 transitions = self._markov_model.transition_prob
273 emissions = self._markov_model.emission_prob
274
275
276 for k in training_seq.states.alphabet.letters:
277 for l in self._markov_model.transitions_from(k):
278 estimated_counts = 0
279
280 for i in range(len(training_seq.emissions) - 1):
281
282 forward_value = forward_vars[(k, i)]
283
284
285 backward_value = backward_vars[(l, i + 1)]
286
287
288 trans_value = transitions[(k, l)]
289
290
291 emm_value = emissions[(l, training_seq.emissions[i + 1])]
292
293 estimated_counts += (forward_value * trans_value *
294 emm_value * backward_value)
295
296
297 transition_counts[(k, l)] += (float(estimated_counts) /
298 training_seq_prob)
299
300 return transition_counts
301
302 - def update_emissions(self, emission_counts, training_seq,
303 forward_vars, backward_vars, training_seq_prob):
304 """Add the contribution of a new training sequence to the emissions
305
306 Arguments:
307
308 o emission_counts -- A dictionary of the current counts for the
309 emissions
310
311 o training_seq -- The training sequence we are working with
312
313 o forward_vars -- Probabilities calculated using the forward
314 algorithm.
315
316 o backward_vars -- Probabilities calculated using the backwards
317 algorithm.
318
319 o training_seq_prob - The probability of the current sequence.
320
321 This calculates E_{k}(b) (the estimated emission probability for
322 emission letter b from state k) using formula 3.21 in Durbin et al.
323 """
324
325 for k in training_seq.states.alphabet.letters:
326
327 for b in training_seq.emissions.alphabet.letters:
328 expected_times = 0
329
330 for i in range(len(training_seq.emissions)):
331
332
333 if training_seq.emissions[i] == b:
334
335 expected_times += (forward_vars[(k, i)] *
336 backward_vars[(k, i)])
337
338
339 emission_counts[(k, b)] += (float(expected_times) /
340 training_seq_prob)
341
342 return emission_counts
343
344
346 """Estimate probabilities with known state sequences.
347
348 This should be used for direct estimation of emission and transition
349 probabilities when both the state path and emission sequence are
350 known for the training examples.
351 """
354
355 - def train(self, training_seqs):
356 """Estimate the Markov Model parameters with known state paths.
357
358 This trainer requires that both the state and the emissions are
359 known for all of the training sequences in the list of
360 TrainingSequence objects.
361 This training will then count all of the transitions and emissions,
362 and use this to estimate the parameters of the model.
363 """
364
365 transition_counts = self._markov_model.get_blank_transitions()
366 emission_counts = self._markov_model.get_blank_emissions()
367
368 for training_seq in training_seqs:
369 emission_counts = self._count_emissions(training_seq,
370 emission_counts)
371 transition_counts = self._count_transitions(training_seq.states,
372 transition_counts)
373
374
375 ml_transitions, ml_emissions = \
376 self.estimate_params(transition_counts,
377 emission_counts)
378 self._markov_model.transition_prob = ml_transitions
379 self._markov_model.emission_prob = ml_emissions
380
381 return self._markov_model
382
384 """Add emissions from the training sequence to the current counts.
385
386 Arguments:
387
388 o training_seq -- A TrainingSequence with states and emissions
389 to get the counts from
390
391 o emission_counts -- The current emission counts to add to.
392 """
393 for index in range(len(training_seq.emissions)):
394 cur_state = training_seq.states[index]
395 cur_emission = training_seq.emissions[index]
396
397 try:
398 emission_counts[(cur_state, cur_emission)] += 1
399 except KeyError:
400 raise KeyError("Unexpected emission (%s, %s)"
401 % (cur_state, cur_emission))
402 return emission_counts
403
405 """Add transitions from the training sequence to the current counts.
406
407 Arguments:
408
409 o state_seq -- A Seq object with the states of the current training
410 sequence.
411
412 o transition_counts -- The current transition counts to add to.
413 """
414 for cur_pos in range(len(state_seq) - 1):
415 cur_state = state_seq[cur_pos]
416 next_state = state_seq[cur_pos + 1]
417
418 try:
419 transition_counts[(cur_state, next_state)] += 1
420 except KeyError:
421 raise KeyError("Unexpected transition (%s, %s)" %
422 (cur_state, next_state))
423
424 return transition_counts
425