Package Bio :: Module MarkovModel
[hide private]
[frames] | no frames]

Source Code for Module Bio.MarkovModel

  1  """ 
  2  This is an implementation of a state-emitting MarkovModel.  I am using 
  3  terminology similar to Manning and Schutze. 
  4   
  5   
  6   
  7  Functions: 
  8  train_bw        Train a markov model using the Baum-Welch algorithm. 
  9  train_visible   Train a visible markov model using MLE. 
 10  find_states     Find the a state sequence that explains some observations. 
 11   
 12  load            Load a MarkovModel. 
 13  save            Save a MarkovModel. 
 14   
 15  Classes: 
 16  MarkovModel     Holds the description of a markov model 
 17  """ 
 18   
 19  import numpy 
 20   
 21  try: 
 22      logaddexp = numpy.logaddexp 
 23  except AttributeError: 
 24      # Numpy versions older than 1.3 do not contain logaddexp. 
 25      # Once we require Numpy version 1.3 or later, we should revisit this 
 26      # module to see if we can simplify some of the other functions in 
 27      # this module. 
 28      import warnings 
 29      warnings.warn("For optimal speed, please update to Numpy version 1.3 or later (current version is %s)" % numpy.__version__) 
 30   
31 - def logaddexp(logx, logy):
32 if logy - logx > 100: 33 return logy 34 elif logx - logy > 100: 35 return logx 36 minxy = min(logx, logy) 37 return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))
38 39
40 -def itemindex(values):
41 d = {} 42 entries = enumerate(values[::-1]) 43 n = len(values)-1 44 for index, key in entries: 45 d[key] = n-index 46 return d
47 48 numpy.random.seed() 49 50 VERY_SMALL_NUMBER = 1E-300 51 LOG0 = numpy.log(VERY_SMALL_NUMBER) 52 53
54 -class MarkovModel(object):
55 - def __init__(self, states, alphabet, 56 p_initial=None, p_transition=None, p_emission=None):
57 self.states = states 58 self.alphabet = alphabet 59 self.p_initial = p_initial 60 self.p_transition = p_transition 61 self.p_emission = p_emission
62
63 - def __str__(self):
64 import StringIO 65 handle = StringIO.StringIO() 66 save(self, handle) 67 handle.seek(0) 68 return handle.read()
69 70
71 -def _readline_and_check_start(handle, start):
72 line = handle.readline() 73 if not line.startswith(start): 74 raise ValueError("I expected %r but got %r" % (start, line)) 75 return line
76 77
78 -def load(handle):
79 """load(handle) -> MarkovModel()""" 80 # Load the states. 81 line = _readline_and_check_start(handle, "STATES:") 82 states = line.split()[1:] 83 84 # Load the alphabet. 85 line = _readline_and_check_start(handle, "ALPHABET:") 86 alphabet = line.split()[1:] 87 88 mm = MarkovModel(states, alphabet) 89 N, M = len(states), len(alphabet) 90 91 # Load the initial probabilities. 92 mm.p_initial = numpy.zeros(N) 93 line = _readline_and_check_start(handle, "INITIAL:") 94 for i in range(len(states)): 95 line = _readline_and_check_start(handle, " %s:" % states[i]) 96 mm.p_initial[i] = float(line.split()[-1]) 97 98 # Load the transition. 99 mm.p_transition = numpy.zeros((N, N)) 100 line = _readline_and_check_start(handle, "TRANSITION:") 101 for i in range(len(states)): 102 line = _readline_and_check_start(handle, " %s:" % states[i]) 103 mm.p_transition[i,:] = map(float, line.split()[1:]) 104 105 # Load the emission. 106 mm.p_emission = numpy.zeros((N, M)) 107 line = _readline_and_check_start(handle, "EMISSION:") 108 for i in range(len(states)): 109 line = _readline_and_check_start(handle, " %s:" % states[i]) 110 mm.p_emission[i,:] = map(float, line.split()[1:]) 111 112 return mm
113 114
115 -def save(mm, handle):
116 """save(mm, handle)""" 117 # This will fail if there are spaces in the states or alphabet. 118 w = handle.write 119 w("STATES: %s\n" % ' '.join(mm.states)) 120 w("ALPHABET: %s\n" % ' '.join(mm.alphabet)) 121 w("INITIAL:\n") 122 for i in range(len(mm.p_initial)): 123 w(" %s: %g\n" % (mm.states[i], mm.p_initial[i])) 124 w("TRANSITION:\n") 125 for i in range(len(mm.p_transition)): 126 x = map(str, mm.p_transition[i]) 127 w(" %s: %s\n" % (mm.states[i], ' '.join(x))) 128 w("EMISSION:\n") 129 for i in range(len(mm.p_emission)): 130 x = map(str, mm.p_emission[i]) 131 w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
132 133 134 # XXX allow them to specify starting points
135 -def train_bw(states, alphabet, training_data, 136 pseudo_initial=None, pseudo_transition=None, pseudo_emission=None, 137 update_fn=None, 138 ):
139 """train_bw(states, alphabet, training_data[, pseudo_initial] 140 [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel 141 142 Train a MarkovModel using the Baum-Welch algorithm. states is a list 143 of strings that describe the names of each state. alphabet is a 144 list of objects that indicate the allowed outputs. training_data 145 is a list of observations. Each observation is a list of objects 146 from the alphabet. 147 148 pseudo_initial, pseudo_transition, and pseudo_emission are 149 optional parameters that you can use to assign pseudo-counts to 150 different matrices. They should be matrices of the appropriate 151 size that contain numbers to add to each parameter matrix, before 152 normalization. 153 154 update_fn is an optional callback that takes parameters 155 (iteration, log_likelihood). It is called once per iteration. 156 157 """ 158 N, M = len(states), len(alphabet) 159 if not training_data: 160 raise ValueError("No training data given.") 161 if pseudo_initial is not None: 162 pseudo_initial = numpy.asarray(pseudo_initial) 163 if pseudo_initial.shape != (N,): 164 raise ValueError("pseudo_initial not shape len(states)") 165 if pseudo_transition is not None: 166 pseudo_transition = numpy.asarray(pseudo_transition) 167 if pseudo_transition.shape != (N,N): 168 raise ValueError("pseudo_transition not shape " + 169 "len(states) X len(states)") 170 if pseudo_emission is not None: 171 pseudo_emission = numpy.asarray(pseudo_emission) 172 if pseudo_emission.shape != (N,M): 173 raise ValueError("pseudo_emission not shape " + 174 "len(states) X len(alphabet)") 175 176 # Training data is given as a list of members of the alphabet. 177 # Replace those with indexes into the alphabet list for easier 178 # computation. 179 training_outputs = [] 180 indexes = itemindex(alphabet) 181 for outputs in training_data: 182 training_outputs.append([indexes[x] for x in outputs]) 183 184 # Do some sanity checking on the outputs. 185 lengths = map(len, training_outputs) 186 if min(lengths) == 0: 187 raise ValueError("I got training data with outputs of length 0") 188 189 # Do the training with baum welch. 190 x = _baum_welch(N, M, training_outputs, 191 pseudo_initial=pseudo_initial, 192 pseudo_transition=pseudo_transition, 193 pseudo_emission=pseudo_emission, 194 update_fn=update_fn) 195 p_initial, p_transition, p_emission = x 196 return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
197 198 MAX_ITERATIONS = 1000 199 200
201 -def _baum_welch(N, M, training_outputs, 202 p_initial=None, p_transition=None, p_emission=None, 203 pseudo_initial=None, pseudo_transition=None, 204 pseudo_emission=None, update_fn=None):
205 # Returns (p_initial, p_transition, p_emission) 206 if p_initial is None: 207 p_initial = _random_norm(N) 208 else: 209 p_initial = _copy_and_check(p_initial, (N,)) 210 211 if p_transition is None: 212 p_transition = _random_norm((N,N)) 213 else: 214 p_transition = _copy_and_check(p_transition, (N,N)) 215 if p_emission is None: 216 p_emission = _random_norm((N,M)) 217 else: 218 p_emission = _copy_and_check(p_emission, (N,M)) 219 220 # Do all the calculations in log space to avoid underflows. 221 lp_initial, lp_transition, lp_emission = map( 222 numpy.log, (p_initial, p_transition, p_emission)) 223 if pseudo_initial is not None: 224 lpseudo_initial = numpy.log(pseudo_initial) 225 else: 226 lpseudo_initial = None 227 if pseudo_transition is not None: 228 lpseudo_transition = numpy.log(pseudo_transition) 229 else: 230 lpseudo_transition = None 231 if pseudo_emission is not None: 232 lpseudo_emission = numpy.log(pseudo_emission) 233 else: 234 lpseudo_emission = None 235 236 # Iterate through each sequence of output, updating the parameters 237 # to the HMM. Stop when the log likelihoods of the sequences 238 # stops varying. 239 prev_llik = None 240 for i in range(MAX_ITERATIONS): 241 llik = LOG0 242 for outputs in training_outputs: 243 x = _baum_welch_one( 244 N, M, outputs, 245 lp_initial, lp_transition, lp_emission, 246 lpseudo_initial, lpseudo_transition, lpseudo_emission,) 247 llik += x 248 if update_fn is not None: 249 update_fn(i, llik) 250 if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1: 251 break 252 prev_llik = llik 253 else: 254 raise RuntimeError("HMM did not converge in %d iterations" 255 % MAX_ITERATIONS) 256 257 # Return everything back in normal space. 258 return map(numpy.exp, (lp_initial, lp_transition, lp_emission))
259 260
261 -def _baum_welch_one(N, M, outputs, 262 lp_initial, lp_transition, lp_emission, 263 lpseudo_initial, lpseudo_transition, lpseudo_emission):
264 # Do one iteration of Baum-Welch based on a sequence of output. 265 # NOTE: This will change the values of lp_initial, lp_transition, 266 # and lp_emission in place. 267 T = len(outputs) 268 fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs) 269 bmat = _backward(N, T, lp_transition, lp_emission, outputs) 270 271 # Calculate the probability of traversing each arc for any given 272 # transition. 273 lp_arc = numpy.zeros((N, N, T)) 274 for t in range(T): 275 k = outputs[t] 276 lp_traverse = numpy.zeros((N, N)) # P going over one arc. 277 for i in range(N): 278 for j in range(N): 279 # P(getting to this arc) 280 # P(making this transition) 281 # P(emitting this character) 282 # P(going to the end) 283 lp = fmat[i][t] + \ 284 lp_transition[i][j] + \ 285 lp_emission[i][k] + \ 286 bmat[j][t+1] 287 lp_traverse[i][j] = lp 288 # Normalize the probability for this time step. 289 lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse) 290 291 # Sum of all the transitions out of state i at time t. 292 lp_arcout_t = numpy.zeros((N, T)) 293 for t in range(T): 294 for i in range(N): 295 lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t]) 296 297 # Sum of all the transitions out of state i. 298 lp_arcout = numpy.zeros(N) 299 for i in range(N): 300 lp_arcout[i] = _logsum(lp_arcout_t[i,:]) 301 302 # UPDATE P_INITIAL. 303 lp_initial = lp_arcout_t[:,0] 304 if lpseudo_initial is not None: 305 lp_initial = _logvecadd(lp_initial, lpseudo_initial) 306 lp_initial = lp_initial - _logsum(lp_initial) 307 308 # UPDATE P_TRANSITION. p_transition[i][j] is the sum of all the 309 # transitions from i to j, normalized by the sum of the 310 # transitions out of i. 311 for i in range(N): 312 for j in range(N): 313 lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i] 314 if lpseudo_transition is not None: 315 lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition) 316 lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i]) 317 318 # UPDATE P_EMISSION. lp_emission[i][k] is the sum of all the 319 # transitions out of i when k is observed, divided by the sum of 320 # the transitions out of i. 321 for i in range(N): 322 ksum = numpy.zeros(M)+LOG0 # ksum[k] is the sum of all i with k. 323 for t in range(T): 324 k = outputs[t] 325 for j in range(N): 326 ksum[k] = logaddexp(ksum[k], lp_arc[i,j,t]) 327 ksum = ksum - _logsum(ksum) # Normalize 328 if lpseudo_emission is not None: 329 ksum = _logvecadd(ksum, lpseudo_emission[i]) 330 ksum = ksum - _logsum(ksum) # Renormalize 331 lp_emission[i,:] = ksum 332 333 # Calculate the log likelihood of the output based on the forward 334 # matrix. Since the parameters of the HMM has changed, the log 335 # likelihoods are going to be a step behind, and we might be doing 336 # one extra iteration of training. The alternative is to rerun 337 # the _forward algorithm and calculate from the clean one, but 338 # that may be more expensive than overshooting the training by one 339 # step. 340 return _logsum(fmat[:,T])
341 342
343 -def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
344 # Implement the forward algorithm. This actually calculates a 345 # Nx(T+1) matrix, where the last column is the total probability 346 # of the output. 347 348 matrix = numpy.zeros((N, T+1)) 349 350 # Initialize the first column to be the initial values. 351 matrix[:,0] = lp_initial 352 for t in range(1, T+1): 353 k = outputs[t-1] 354 for j in range(N): 355 # The probability of the state is the sum of the 356 # transitions from all the states from time t-1. 357 lprob = LOG0 358 for i in range(N): 359 lp = matrix[i][t-1] + \ 360 lp_transition[i][j] + \ 361 lp_emission[i][k] 362 lprob = logaddexp(lprob, lp) 363 matrix[j][t] = lprob 364 return matrix
365 366
367 -def _backward(N, T, lp_transition, lp_emission, outputs):
368 matrix = numpy.zeros((N, T+1)) 369 for t in range(T-1, -1, -1): 370 k = outputs[t] 371 for i in range(N): 372 # The probability of the state is the sum of the 373 # transitions from all the states from time t+1. 374 lprob = LOG0 375 for j in range(N): 376 lp = matrix[j][t+1] + \ 377 lp_transition[i][j] + \ 378 lp_emission[i][k] 379 lprob = logaddexp(lprob, lp) 380 matrix[i][t] = lprob 381 return matrix
382 383
384 -def train_visible(states, alphabet, training_data, 385 pseudo_initial=None, pseudo_transition=None, 386 pseudo_emission=None):
387 """train_visible(states, alphabet, training_data[, pseudo_initial] 388 [, pseudo_transition][, pseudo_emission]) -> MarkovModel 389 390 Train a visible MarkovModel using maximum likelihoood estimates 391 for each of the parameters. states is a list of strings that 392 describe the names of each state. alphabet is a list of objects 393 that indicate the allowed outputs. training_data is a list of 394 (outputs, observed states) where outputs is a list of the emission 395 from the alphabet, and observed states is a list of states from 396 states. 397 398 pseudo_initial, pseudo_transition, and pseudo_emission are 399 optional parameters that you can use to assign pseudo-counts to 400 different matrices. They should be matrices of the appropriate 401 size that contain numbers to add to each parameter matrix 402 403 """ 404 N, M = len(states), len(alphabet) 405 if pseudo_initial is not None: 406 pseudo_initial = numpy.asarray(pseudo_initial) 407 if pseudo_initial.shape != (N,): 408 raise ValueError("pseudo_initial not shape len(states)") 409 if pseudo_transition is not None: 410 pseudo_transition = numpy.asarray(pseudo_transition) 411 if pseudo_transition.shape != (N,N): 412 raise ValueError("pseudo_transition not shape " + 413 "len(states) X len(states)") 414 if pseudo_emission is not None: 415 pseudo_emission = numpy.asarray(pseudo_emission) 416 if pseudo_emission.shape != (N,M): 417 raise ValueError("pseudo_emission not shape " + 418 "len(states) X len(alphabet)") 419 420 # Training data is given as a list of members of the alphabet. 421 # Replace those with indexes into the alphabet list for easier 422 # computation. 423 training_states, training_outputs = [], [] 424 states_indexes = itemindex(states) 425 outputs_indexes = itemindex(alphabet) 426 for toutputs, tstates in training_data: 427 if len(tstates) != len(toutputs): 428 raise ValueError("states and outputs not aligned") 429 training_states.append([states_indexes[x] for x in tstates]) 430 training_outputs.append([outputs_indexes[x] for x in toutputs]) 431 432 x = _mle(N, M, training_outputs, training_states, 433 pseudo_initial, pseudo_transition, pseudo_emission) 434 p_initial, p_transition, p_emission = x 435 436 return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
437 438
439 -def _mle(N, M, training_outputs, training_states, pseudo_initial, 440 pseudo_transition, pseudo_emission):
441 # p_initial is the probability that a sequence of states starts 442 # off with a particular one. 443 p_initial = numpy.zeros(N) 444 if pseudo_initial: 445 p_initial = p_initial + pseudo_initial 446 for states in training_states: 447 p_initial[states[0]] += 1 448 p_initial = _normalize(p_initial) 449 450 # p_transition is the probability that a state leads to the next 451 # one. C(i,j)/C(i) where i and j are states. 452 p_transition = numpy.zeros((N,N)) 453 if pseudo_transition: 454 p_transition = p_transition + pseudo_transition 455 for states in training_states: 456 for n in range(len(states)-1): 457 i, j = states[n], states[n+1] 458 p_transition[i, j] += 1 459 for i in range(len(p_transition)): 460 p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:]) 461 462 # p_emission is the probability of an output given a state. 463 # C(s,o)|C(s) where o is an output and s is a state. 464 p_emission = numpy.zeros((N,M)) 465 if pseudo_emission: 466 p_emission = p_emission + pseudo_emission 467 p_emission = numpy.ones((N,M)) 468 for outputs, states in zip(training_outputs, training_states): 469 for o, s in zip(outputs, states): 470 p_emission[s, o] += 1 471 for i in range(len(p_emission)): 472 p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:]) 473 474 return p_initial, p_transition, p_emission
475 476
477 -def _argmaxes(vector, allowance=None):
478 return [numpy.argmax(vector)]
479 480
481 -def find_states(markov_model, output):
482 """find_states(markov_model, output) -> list of (states, score)""" 483 mm = markov_model 484 N = len(mm.states) 485 486 # _viterbi does calculations in log space. Add a tiny bit to the 487 # matrices so that the logs will not break. 488 x = mm.p_initial + VERY_SMALL_NUMBER 489 y = mm.p_transition + VERY_SMALL_NUMBER 490 z = mm.p_emission + VERY_SMALL_NUMBER 491 lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z)) 492 # Change output into a list of indexes into the alphabet. 493 indexes = itemindex(mm.alphabet) 494 output = [indexes[x] for x in output] 495 496 # Run the viterbi algorithm. 497 results = _viterbi(N, lp_initial, lp_transition, lp_emission, output) 498 499 for i in range(len(results)): 500 states, score = results[i] 501 results[i] = [mm.states[x] for x in states], numpy.exp(score) 502 return results
503 504
505 -def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
506 # The Viterbi algorithm finds the most likely set of states for a 507 # given output. Returns a list of states. 508 509 T = len(output) 510 # Store the backtrace in a NxT matrix. 511 backtrace = [] # list of indexes of states in previous timestep. 512 for i in range(N): 513 backtrace.append([None] * T) 514 515 # Store the best scores. 516 scores = numpy.zeros((N, T)) 517 scores[:,0] = lp_initial + lp_emission[:,output[0]] 518 for t in range(1, T): 519 k = output[t] 520 for j in range(N): 521 # Find the most likely place it came from. 522 i_scores = scores[:,t-1] + \ 523 lp_transition[:,j] + \ 524 lp_emission[j,k] 525 indexes = _argmaxes(i_scores) 526 scores[j,t] = i_scores[indexes[0]] 527 backtrace[j][t] = indexes 528 529 # Do the backtrace. First, find a good place to start. Then, 530 # we'll follow the backtrace matrix to find the list of states. 531 # In the event of ties, there may be multiple paths back through 532 # the matrix, which implies a recursive solution. We'll simulate 533 # it by keeping our own stack. 534 in_process = [] # list of (t, states, score) 535 results = [] # return values. list of (states, score) 536 indexes = _argmaxes(scores[:,T-1]) # pick the first place 537 for i in indexes: 538 in_process.append((T-1, [i], scores[i][T-1])) 539 while in_process: 540 t, states, score = in_process.pop() 541 if t == 0: 542 results.append((states, score)) 543 else: 544 indexes = backtrace[states[0]][t] 545 for i in indexes: 546 in_process.append((t-1, [i]+states, score)) 547 return results
548 549
550 -def _normalize(matrix):
551 # Make sure numbers add up to 1.0 552 if len(matrix.shape) == 1: 553 matrix = matrix / float(sum(matrix)) 554 elif len(matrix.shape) == 2: 555 # Normalize by rows. 556 for i in range(len(matrix)): 557 matrix[i,:] = matrix[i,:] / sum(matrix[i,:]) 558 else: 559 raise ValueError("I cannot handle matrixes of that shape") 560 return matrix
561 562
563 -def _uniform_norm(shape):
564 matrix = numpy.ones(shape) 565 return _normalize(matrix)
566 567
568 -def _random_norm(shape):
569 matrix = numpy.random.random(shape) 570 return _normalize(matrix)
571 572
573 -def _copy_and_check(matrix, desired_shape):
574 # Copy the matrix. 575 matrix = numpy.array(matrix, copy=1) 576 # Check the dimensions. 577 if matrix.shape != desired_shape: 578 raise ValueError("Incorrect dimension") 579 # Make sure it's normalized. 580 if len(matrix.shape) == 1: 581 if numpy.fabs(sum(matrix)-1.0) > 0.01: 582 raise ValueError("matrix not normalized to 1.0") 583 elif len(matrix.shape) == 2: 584 for i in range(len(matrix)): 585 if numpy.fabs(sum(matrix[i])-1.0) > 0.01: 586 raise ValueError("matrix %d not normalized to 1.0" % i) 587 else: 588 raise ValueError("I don't handle matrices > 2 dimensions") 589 return matrix
590 591
592 -def _logsum(matrix):
593 if len(matrix.shape) > 1: 594 vec = numpy.reshape(matrix, (numpy.product(matrix.shape),)) 595 else: 596 vec = matrix 597 sum = LOG0 598 for num in vec: 599 sum = logaddexp(sum, num) 600 return sum
601 602
603 -def _logvecadd(logvec1, logvec2):
604 assert len(logvec1) == len(logvec2), "vectors aren't the same length" 605 sumvec = numpy.zeros(len(logvec1)) 606 for i in range(len(logvec1)): 607 sumvec[i] = logaddexp(logvec1[i], logvec2[i]) 608 return sumvec
609 610
611 -def _exp_logsum(numbers):
612 sum = _logsum(numbers) 613 return numpy.exp(sum)
614