1 """
2 This is an implementation of a state-emitting MarkovModel. I am using
3 terminology similar to Manning and Schutze.
4
5
6
7 Functions:
8 train_bw Train a markov model using the Baum-Welch algorithm.
9 train_visible Train a visible markov model using MLE.
10 find_states Find the a state sequence that explains some observations.
11
12 load Load a MarkovModel.
13 save Save a MarkovModel.
14
15 Classes:
16 MarkovModel Holds the description of a markov model
17 """
18
19 import numpy
20
21 try:
22 logaddexp = numpy.logaddexp
23 except AttributeError:
24
25
26
27
28 import warnings
29 warnings.warn("For optimal speed, please update to Numpy version 1.3 or later (current version is %s)" % numpy.__version__)
30
32 if logy - logx > 100:
33 return logy
34 elif logx - logy > 100:
35 return logx
36 minxy = min(logx, logy)
37 return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))
38
39
41 d = {}
42 entries = enumerate(values[::-1])
43 n = len(values)-1
44 for index, key in entries:
45 d[key] = n-index
46 return d
47
48 numpy.random.seed()
49
50 VERY_SMALL_NUMBER = 1E-300
51 LOG0 = numpy.log(VERY_SMALL_NUMBER)
52
53
55 - def __init__(self, states, alphabet,
56 p_initial=None, p_transition=None, p_emission=None):
57 self.states = states
58 self.alphabet = alphabet
59 self.p_initial = p_initial
60 self.p_transition = p_transition
61 self.p_emission = p_emission
62
69
70
76
77
79 """load(handle) -> MarkovModel()"""
80
81 line = _readline_and_check_start(handle, "STATES:")
82 states = line.split()[1:]
83
84
85 line = _readline_and_check_start(handle, "ALPHABET:")
86 alphabet = line.split()[1:]
87
88 mm = MarkovModel(states, alphabet)
89 N, M = len(states), len(alphabet)
90
91
92 mm.p_initial = numpy.zeros(N)
93 line = _readline_and_check_start(handle, "INITIAL:")
94 for i in range(len(states)):
95 line = _readline_and_check_start(handle, " %s:" % states[i])
96 mm.p_initial[i] = float(line.split()[-1])
97
98
99 mm.p_transition = numpy.zeros((N, N))
100 line = _readline_and_check_start(handle, "TRANSITION:")
101 for i in range(len(states)):
102 line = _readline_and_check_start(handle, " %s:" % states[i])
103 mm.p_transition[i,:] = map(float, line.split()[1:])
104
105
106 mm.p_emission = numpy.zeros((N, M))
107 line = _readline_and_check_start(handle, "EMISSION:")
108 for i in range(len(states)):
109 line = _readline_and_check_start(handle, " %s:" % states[i])
110 mm.p_emission[i,:] = map(float, line.split()[1:])
111
112 return mm
113
114
115 -def save(mm, handle):
116 """save(mm, handle)"""
117
118 w = handle.write
119 w("STATES: %s\n" % ' '.join(mm.states))
120 w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
121 w("INITIAL:\n")
122 for i in range(len(mm.p_initial)):
123 w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
124 w("TRANSITION:\n")
125 for i in range(len(mm.p_transition)):
126 x = map(str, mm.p_transition[i])
127 w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
128 w("EMISSION:\n")
129 for i in range(len(mm.p_emission)):
130 x = map(str, mm.p_emission[i])
131 w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
132
133
134
135 -def train_bw(states, alphabet, training_data,
136 pseudo_initial=None, pseudo_transition=None, pseudo_emission=None,
137 update_fn=None,
138 ):
139 """train_bw(states, alphabet, training_data[, pseudo_initial]
140 [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel
141
142 Train a MarkovModel using the Baum-Welch algorithm. states is a list
143 of strings that describe the names of each state. alphabet is a
144 list of objects that indicate the allowed outputs. training_data
145 is a list of observations. Each observation is a list of objects
146 from the alphabet.
147
148 pseudo_initial, pseudo_transition, and pseudo_emission are
149 optional parameters that you can use to assign pseudo-counts to
150 different matrices. They should be matrices of the appropriate
151 size that contain numbers to add to each parameter matrix, before
152 normalization.
153
154 update_fn is an optional callback that takes parameters
155 (iteration, log_likelihood). It is called once per iteration.
156
157 """
158 N, M = len(states), len(alphabet)
159 if not training_data:
160 raise ValueError("No training data given.")
161 if pseudo_initial is not None:
162 pseudo_initial = numpy.asarray(pseudo_initial)
163 if pseudo_initial.shape != (N,):
164 raise ValueError("pseudo_initial not shape len(states)")
165 if pseudo_transition is not None:
166 pseudo_transition = numpy.asarray(pseudo_transition)
167 if pseudo_transition.shape != (N,N):
168 raise ValueError("pseudo_transition not shape " +
169 "len(states) X len(states)")
170 if pseudo_emission is not None:
171 pseudo_emission = numpy.asarray(pseudo_emission)
172 if pseudo_emission.shape != (N,M):
173 raise ValueError("pseudo_emission not shape " +
174 "len(states) X len(alphabet)")
175
176
177
178
179 training_outputs = []
180 indexes = itemindex(alphabet)
181 for outputs in training_data:
182 training_outputs.append([indexes[x] for x in outputs])
183
184
185 lengths = map(len, training_outputs)
186 if min(lengths) == 0:
187 raise ValueError("I got training data with outputs of length 0")
188
189
190 x = _baum_welch(N, M, training_outputs,
191 pseudo_initial=pseudo_initial,
192 pseudo_transition=pseudo_transition,
193 pseudo_emission=pseudo_emission,
194 update_fn=update_fn)
195 p_initial, p_transition, p_emission = x
196 return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
197
198 MAX_ITERATIONS = 1000
199
200
201 -def _baum_welch(N, M, training_outputs,
202 p_initial=None, p_transition=None, p_emission=None,
203 pseudo_initial=None, pseudo_transition=None,
204 pseudo_emission=None, update_fn=None):
205
206 if p_initial is None:
207 p_initial = _random_norm(N)
208 else:
209 p_initial = _copy_and_check(p_initial, (N,))
210
211 if p_transition is None:
212 p_transition = _random_norm((N,N))
213 else:
214 p_transition = _copy_and_check(p_transition, (N,N))
215 if p_emission is None:
216 p_emission = _random_norm((N,M))
217 else:
218 p_emission = _copy_and_check(p_emission, (N,M))
219
220
221 lp_initial, lp_transition, lp_emission = map(
222 numpy.log, (p_initial, p_transition, p_emission))
223 if pseudo_initial is not None:
224 lpseudo_initial = numpy.log(pseudo_initial)
225 else:
226 lpseudo_initial = None
227 if pseudo_transition is not None:
228 lpseudo_transition = numpy.log(pseudo_transition)
229 else:
230 lpseudo_transition = None
231 if pseudo_emission is not None:
232 lpseudo_emission = numpy.log(pseudo_emission)
233 else:
234 lpseudo_emission = None
235
236
237
238
239 prev_llik = None
240 for i in range(MAX_ITERATIONS):
241 llik = LOG0
242 for outputs in training_outputs:
243 x = _baum_welch_one(
244 N, M, outputs,
245 lp_initial, lp_transition, lp_emission,
246 lpseudo_initial, lpseudo_transition, lpseudo_emission,)
247 llik += x
248 if update_fn is not None:
249 update_fn(i, llik)
250 if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1:
251 break
252 prev_llik = llik
253 else:
254 raise RuntimeError("HMM did not converge in %d iterations"
255 % MAX_ITERATIONS)
256
257
258 return map(numpy.exp, (lp_initial, lp_transition, lp_emission))
259
260
261 -def _baum_welch_one(N, M, outputs,
262 lp_initial, lp_transition, lp_emission,
263 lpseudo_initial, lpseudo_transition, lpseudo_emission):
264
265
266
267 T = len(outputs)
268 fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
269 bmat = _backward(N, T, lp_transition, lp_emission, outputs)
270
271
272
273 lp_arc = numpy.zeros((N, N, T))
274 for t in range(T):
275 k = outputs[t]
276 lp_traverse = numpy.zeros((N, N))
277 for i in range(N):
278 for j in range(N):
279
280
281
282
283 lp = fmat[i][t] + \
284 lp_transition[i][j] + \
285 lp_emission[i][k] + \
286 bmat[j][t+1]
287 lp_traverse[i][j] = lp
288
289 lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse)
290
291
292 lp_arcout_t = numpy.zeros((N, T))
293 for t in range(T):
294 for i in range(N):
295 lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t])
296
297
298 lp_arcout = numpy.zeros(N)
299 for i in range(N):
300 lp_arcout[i] = _logsum(lp_arcout_t[i,:])
301
302
303 lp_initial = lp_arcout_t[:,0]
304 if lpseudo_initial is not None:
305 lp_initial = _logvecadd(lp_initial, lpseudo_initial)
306 lp_initial = lp_initial - _logsum(lp_initial)
307
308
309
310
311 for i in range(N):
312 for j in range(N):
313 lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i]
314 if lpseudo_transition is not None:
315 lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
316 lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])
317
318
319
320
321 for i in range(N):
322 ksum = numpy.zeros(M)+LOG0
323 for t in range(T):
324 k = outputs[t]
325 for j in range(N):
326 ksum[k] = logaddexp(ksum[k], lp_arc[i,j,t])
327 ksum = ksum - _logsum(ksum)
328 if lpseudo_emission is not None:
329 ksum = _logvecadd(ksum, lpseudo_emission[i])
330 ksum = ksum - _logsum(ksum)
331 lp_emission[i,:] = ksum
332
333
334
335
336
337
338
339
340 return _logsum(fmat[:,T])
341
342
343 -def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
344
345
346
347
348 matrix = numpy.zeros((N, T+1))
349
350
351 matrix[:,0] = lp_initial
352 for t in range(1, T+1):
353 k = outputs[t-1]
354 for j in range(N):
355
356
357 lprob = LOG0
358 for i in range(N):
359 lp = matrix[i][t-1] + \
360 lp_transition[i][j] + \
361 lp_emission[i][k]
362 lprob = logaddexp(lprob, lp)
363 matrix[j][t] = lprob
364 return matrix
365
366
367 -def _backward(N, T, lp_transition, lp_emission, outputs):
368 matrix = numpy.zeros((N, T+1))
369 for t in range(T-1, -1, -1):
370 k = outputs[t]
371 for i in range(N):
372
373
374 lprob = LOG0
375 for j in range(N):
376 lp = matrix[j][t+1] + \
377 lp_transition[i][j] + \
378 lp_emission[i][k]
379 lprob = logaddexp(lprob, lp)
380 matrix[i][t] = lprob
381 return matrix
382
383
384 -def train_visible(states, alphabet, training_data,
385 pseudo_initial=None, pseudo_transition=None,
386 pseudo_emission=None):
387 """train_visible(states, alphabet, training_data[, pseudo_initial]
388 [, pseudo_transition][, pseudo_emission]) -> MarkovModel
389
390 Train a visible MarkovModel using maximum likelihoood estimates
391 for each of the parameters. states is a list of strings that
392 describe the names of each state. alphabet is a list of objects
393 that indicate the allowed outputs. training_data is a list of
394 (outputs, observed states) where outputs is a list of the emission
395 from the alphabet, and observed states is a list of states from
396 states.
397
398 pseudo_initial, pseudo_transition, and pseudo_emission are
399 optional parameters that you can use to assign pseudo-counts to
400 different matrices. They should be matrices of the appropriate
401 size that contain numbers to add to each parameter matrix
402
403 """
404 N, M = len(states), len(alphabet)
405 if pseudo_initial is not None:
406 pseudo_initial = numpy.asarray(pseudo_initial)
407 if pseudo_initial.shape != (N,):
408 raise ValueError("pseudo_initial not shape len(states)")
409 if pseudo_transition is not None:
410 pseudo_transition = numpy.asarray(pseudo_transition)
411 if pseudo_transition.shape != (N,N):
412 raise ValueError("pseudo_transition not shape " +
413 "len(states) X len(states)")
414 if pseudo_emission is not None:
415 pseudo_emission = numpy.asarray(pseudo_emission)
416 if pseudo_emission.shape != (N,M):
417 raise ValueError("pseudo_emission not shape " +
418 "len(states) X len(alphabet)")
419
420
421
422
423 training_states, training_outputs = [], []
424 states_indexes = itemindex(states)
425 outputs_indexes = itemindex(alphabet)
426 for toutputs, tstates in training_data:
427 if len(tstates) != len(toutputs):
428 raise ValueError("states and outputs not aligned")
429 training_states.append([states_indexes[x] for x in tstates])
430 training_outputs.append([outputs_indexes[x] for x in toutputs])
431
432 x = _mle(N, M, training_outputs, training_states,
433 pseudo_initial, pseudo_transition, pseudo_emission)
434 p_initial, p_transition, p_emission = x
435
436 return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
437
438
439 -def _mle(N, M, training_outputs, training_states, pseudo_initial,
440 pseudo_transition, pseudo_emission):
441
442
443 p_initial = numpy.zeros(N)
444 if pseudo_initial:
445 p_initial = p_initial + pseudo_initial
446 for states in training_states:
447 p_initial[states[0]] += 1
448 p_initial = _normalize(p_initial)
449
450
451
452 p_transition = numpy.zeros((N,N))
453 if pseudo_transition:
454 p_transition = p_transition + pseudo_transition
455 for states in training_states:
456 for n in range(len(states)-1):
457 i, j = states[n], states[n+1]
458 p_transition[i, j] += 1
459 for i in range(len(p_transition)):
460 p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])
461
462
463
464 p_emission = numpy.zeros((N,M))
465 if pseudo_emission:
466 p_emission = p_emission + pseudo_emission
467 p_emission = numpy.ones((N,M))
468 for outputs, states in zip(training_outputs, training_states):
469 for o, s in zip(outputs, states):
470 p_emission[s, o] += 1
471 for i in range(len(p_emission)):
472 p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])
473
474 return p_initial, p_transition, p_emission
475
476
478 return [numpy.argmax(vector)]
479
480
482 """find_states(markov_model, output) -> list of (states, score)"""
483 mm = markov_model
484 N = len(mm.states)
485
486
487
488 x = mm.p_initial + VERY_SMALL_NUMBER
489 y = mm.p_transition + VERY_SMALL_NUMBER
490 z = mm.p_emission + VERY_SMALL_NUMBER
491 lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z))
492
493 indexes = itemindex(mm.alphabet)
494 output = [indexes[x] for x in output]
495
496
497 results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)
498
499 for i in range(len(results)):
500 states, score = results[i]
501 results[i] = [mm.states[x] for x in states], numpy.exp(score)
502 return results
503
504
505 -def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
506
507
508
509 T = len(output)
510
511 backtrace = []
512 for i in range(N):
513 backtrace.append([None] * T)
514
515
516 scores = numpy.zeros((N, T))
517 scores[:,0] = lp_initial + lp_emission[:,output[0]]
518 for t in range(1, T):
519 k = output[t]
520 for j in range(N):
521
522 i_scores = scores[:,t-1] + \
523 lp_transition[:,j] + \
524 lp_emission[j,k]
525 indexes = _argmaxes(i_scores)
526 scores[j,t] = i_scores[indexes[0]]
527 backtrace[j][t] = indexes
528
529
530
531
532
533
534 in_process = []
535 results = []
536 indexes = _argmaxes(scores[:,T-1])
537 for i in indexes:
538 in_process.append((T-1, [i], scores[i][T-1]))
539 while in_process:
540 t, states, score = in_process.pop()
541 if t == 0:
542 results.append((states, score))
543 else:
544 indexes = backtrace[states[0]][t]
545 for i in indexes:
546 in_process.append((t-1, [i]+states, score))
547 return results
548
549
561
562
566
567
571
572
574
575 matrix = numpy.array(matrix, copy=1)
576
577 if matrix.shape != desired_shape:
578 raise ValueError("Incorrect dimension")
579
580 if len(matrix.shape) == 1:
581 if numpy.fabs(sum(matrix)-1.0) > 0.01:
582 raise ValueError("matrix not normalized to 1.0")
583 elif len(matrix.shape) == 2:
584 for i in range(len(matrix)):
585 if numpy.fabs(sum(matrix[i])-1.0) > 0.01:
586 raise ValueError("matrix %d not normalized to 1.0" % i)
587 else:
588 raise ValueError("I don't handle matrices > 2 dimensions")
589 return matrix
590
591
601
602
604 assert len(logvec1) == len(logvec2), "vectors aren't the same length"
605 sumvec = numpy.zeros(len(logvec1))
606 for i in range(len(logvec1)):
607 sumvec[i] = logaddexp(logvec1[i], logvec2[i])
608 return sumvec
609
610
614