Package Bio :: Package HMM :: Module Trainer
[hide private]
[frames] | no frames]

Source Code for Module Bio.HMM.Trainer

  1  """Provide trainers which estimate parameters based on training sequences. 
  2   
  3  These should be used to 'train' a Markov Model prior to actually using 
  4  it to decode state paths. When supplied training sequences and a model 
  5  to work from, these classes will estimate parameters of the model. 
  6   
  7  This aims to estimate two parameters: 
  8   
  9  * a_{kl} -- the number of times there is a transition from k to l in the 
 10  training data. 
 11   
 12  * e_{k}(b) -- the number of emissions of the state b from the letter k 
 13  in the training data. 
 14  """ 
 15  # standard modules 
 16  import math 
 17   
 18  # local stuff 
 19  from DynamicProgramming import ScaledDPAlgorithms 
 20   
 21   
22 -class TrainingSequence(object):
23 """Hold a training sequence with emissions and optionally, a state path. 24 """
25 - def __init__(self, emissions, state_path):
26 """Initialize a training sequence. 27 28 Arguments: 29 30 o emissions - A Seq object containing the sequence of emissions in 31 the training sequence, and the alphabet of the sequence. 32 33 o state_path - A Seq object containing the sequence of states and 34 the alphabet of the states. If there is no known state path, then 35 the sequence of states should be an empty string. 36 """ 37 if len(state_path) > 0: 38 assert len(emissions) == len(state_path), \ 39 "State path does not match associated emissions." 40 self.emissions = emissions 41 self.states = state_path
42 43
44 -class AbstractTrainer(object):
45 """Provide generic functionality needed in all trainers. 46 """
47 - def __init__(self, markov_model):
48 self._markov_model = markov_model
49
50 - def log_likelihood(self, probabilities):
51 """Calculate the log likelihood of the training seqs. 52 53 Arguments: 54 55 o probabilities -- A list of the probabilities of each training 56 sequence under the current parameters, calculated using the forward 57 algorithm. 58 """ 59 total_likelihood = 0 60 for probability in probabilities: 61 total_likelihood += math.log(probability) 62 63 return total_likelihood
64
65 - def estimate_params(self, transition_counts, emission_counts):
66 """Get a maximum likelihood estimation of transition and emmission. 67 68 Arguments: 69 70 o transition_counts -- A dictionary with the total number of counts 71 of transitions between two states. 72 73 o emissions_counts -- A dictionary with the total number of counts 74 of emmissions of a particular emission letter by a state letter. 75 76 This then returns the maximum likelihood estimators for the 77 transitions and emissions, estimated by formulas 3.18 in 78 Durbin et al: 79 80 a_{kl} = A_{kl} / sum(A_{kl'}) 81 e_{k}(b) = E_{k}(b) / sum(E_{k}(b')) 82 83 Returns: 84 Transition and emission dictionaries containing the maximum 85 likelihood estimators. 86 """ 87 # now calculate the information 88 ml_transitions = self.ml_estimator(transition_counts) 89 ml_emissions = self.ml_estimator(emission_counts) 90 91 return ml_transitions, ml_emissions
92
93 - def ml_estimator(self, counts):
94 """Calculate the maximum likelihood estimator. 95 96 This can calculate maximum likelihoods for both transitions 97 and emissions. 98 99 Arguments: 100 101 o counts -- A dictionary of the counts for each item. 102 103 See estimate_params for a description of the formula used for 104 calculation. 105 """ 106 # get an ordered list of all items 107 all_ordered = counts.keys() 108 all_ordered.sort() 109 110 ml_estimation = {} 111 112 # the total counts for the current letter we are on 113 cur_letter = None 114 cur_letter_counts = 0 115 116 for cur_item in all_ordered: 117 # if we are on a new letter (ie. the first letter of the tuple) 118 if cur_item[0] != cur_letter: 119 # set the new letter we are working with 120 cur_letter = cur_item[0] 121 122 # count up the total counts for this letter 123 cur_letter_counts = counts[cur_item] 124 125 # add counts for all other items with the same first letter 126 cur_position = all_ordered.index(cur_item) + 1 127 128 # keep adding while we have the same first letter or until 129 # we get to the end of the ordered list 130 while (cur_position < len(all_ordered) and 131 all_ordered[cur_position][0] == cur_item[0]): 132 cur_letter_counts += counts[all_ordered[cur_position]] 133 cur_position += 1 134 # otherwise we've already got the total counts for this letter 135 else: 136 pass 137 138 # now calculate the ml and add it to the estimation 139 cur_ml = float(counts[cur_item]) / float(cur_letter_counts) 140 ml_estimation[cur_item] = cur_ml 141 142 return ml_estimation
143 144
145 -class BaumWelchTrainer(AbstractTrainer):
146 """Trainer that uses the Baum-Welch algorithm to estimate parameters. 147 148 These should be used when a training sequence for an HMM has unknown 149 paths for the actual states, and you need to make an estimation of the 150 model parameters from the observed emissions. 151 152 This uses the Baum-Welch algorithm, first described in 153 Baum, L.E. 1972. Inequalities. 3:1-8 154 This is based on the description in 'Biological Sequence Analysis' by 155 Durbin et al. in section 3.3 156 157 This algorithm is guaranteed to converge to a local maximum, but not 158 necessarily to the global maxima, so use with care! 159 """
160 - def __init__(self, markov_model):
161 """Initialize the trainer. 162 163 Arguments: 164 165 o markov_model - The model we are going to estimate parameters for. 166 This should have the parameters with some initial estimates, that 167 we can build from. 168 """ 169 AbstractTrainer.__init__(self, markov_model)
170
171 - def train(self, training_seqs, stopping_criteria, 172 dp_method = ScaledDPAlgorithms):
173 """Estimate the parameters using training sequences. 174 175 The algorithm for this is taken from Durbin et al. p64, so this 176 is a good place to go for a reference on what is going on. 177 178 Arguments: 179 180 o training_seqs -- A list of TrainingSequence objects to be used 181 for estimating the parameters. 182 183 o stopping_criteria -- A function, that when passed the change 184 in log likelihood and threshold, will indicate if we should stop 185 the estimation iterations. 186 187 o dp_method -- A class instance specifying the dynamic programming 188 implementation we should use to calculate the forward and 189 backward variables. By default, we use the scaling method. 190 """ 191 prev_log_likelihood = None 192 num_iterations = 1 193 194 while 1: 195 transition_count = self._markov_model.get_blank_transitions() 196 emission_count = self._markov_model.get_blank_emissions() 197 198 # remember all of the sequence probabilities 199 all_probabilities = [] 200 201 for training_seq in training_seqs: 202 # calculate the forward and backward variables 203 DP = dp_method(self._markov_model, training_seq) 204 forward_var, seq_prob = DP.forward_algorithm() 205 backward_var = DP.backward_algorithm() 206 207 all_probabilities.append(seq_prob) 208 209 # update the counts for transitions and emissions 210 transition_count = self.update_transitions(transition_count, 211 training_seq, 212 forward_var, 213 backward_var, 214 seq_prob) 215 emission_count = self.update_emissions(emission_count, 216 training_seq, 217 forward_var, 218 backward_var, 219 seq_prob) 220 221 # update the markov model with the new probabilities 222 ml_transitions, ml_emissions = \ 223 self.estimate_params(transition_count, emission_count) 224 self._markov_model.transition_prob = ml_transitions 225 self._markov_model.emission_prob = ml_emissions 226 227 cur_log_likelihood = self.log_likelihood(all_probabilities) 228 229 # if we have previously calculated the log likelihood (ie. 230 # not the first round), see if we can finish 231 if prev_log_likelihood is not None: 232 # XXX log likelihoods are negatives -- am I calculating 233 # the change properly, or should I use the negatives... 234 # I'm not sure at all if this is right. 235 log_likelihood_change = abs(abs(cur_log_likelihood) - 236 abs(prev_log_likelihood)) 237 238 # check whether we have completed enough iterations to have 239 # a good estimation 240 if stopping_criteria(log_likelihood_change, num_iterations): 241 break 242 243 # set up for another round of iterations 244 prev_log_likelihood = cur_log_likelihood 245 num_iterations += 1 246 247 return self._markov_model
248
249 - def update_transitions(self, transition_counts, training_seq, 250 forward_vars, backward_vars, training_seq_prob):
251 """Add the contribution of a new training sequence to the transitions. 252 253 Arguments: 254 255 o transition_counts -- A dictionary of the current counts for the 256 transitions 257 258 o training_seq -- The training sequence we are working with 259 260 o forward_vars -- Probabilities calculated using the forward 261 algorithm. 262 263 o backward_vars -- Probabilities calculated using the backwards 264 algorithm. 265 266 o training_seq_prob - The probability of the current sequence. 267 268 This calculates A_{kl} (the estimated transition counts from state 269 k to state l) using formula 3.20 in Durbin et al. 270 """ 271 # set up the transition and emission probabilities we are using 272 transitions = self._markov_model.transition_prob 273 emissions = self._markov_model.emission_prob 274 275 # loop over the possible combinations of state path letters 276 for k in training_seq.states.alphabet.letters: 277 for l in self._markov_model.transitions_from(k): 278 estimated_counts = 0 279 # now loop over the entire training sequence 280 for i in range(len(training_seq.emissions) - 1): 281 # the forward value of k at the current position 282 forward_value = forward_vars[(k, i)] 283 284 # the backward value of l in the next position 285 backward_value = backward_vars[(l, i + 1)] 286 287 # the probability of a transition from k to l 288 trans_value = transitions[(k, l)] 289 290 # the probability of getting the emission at the next pos 291 emm_value = emissions[(l, training_seq.emissions[i + 1])] 292 293 estimated_counts += (forward_value * trans_value * 294 emm_value * backward_value) 295 296 # update the transition approximation 297 transition_counts[(k, l)] += (float(estimated_counts) / 298 training_seq_prob) 299 300 return transition_counts
301
302 - def update_emissions(self, emission_counts, training_seq, 303 forward_vars, backward_vars, training_seq_prob):
304 """Add the contribution of a new training sequence to the emissions 305 306 Arguments: 307 308 o emission_counts -- A dictionary of the current counts for the 309 emissions 310 311 o training_seq -- The training sequence we are working with 312 313 o forward_vars -- Probabilities calculated using the forward 314 algorithm. 315 316 o backward_vars -- Probabilities calculated using the backwards 317 algorithm. 318 319 o training_seq_prob - The probability of the current sequence. 320 321 This calculates E_{k}(b) (the estimated emission probability for 322 emission letter b from state k) using formula 3.21 in Durbin et al. 323 """ 324 # loop over the possible combinations of state path letters 325 for k in training_seq.states.alphabet.letters: 326 # now loop over all of the possible emissions 327 for b in training_seq.emissions.alphabet.letters: 328 expected_times = 0 329 # finally loop over the entire training sequence 330 for i in range(len(training_seq.emissions)): 331 # only count the forward and backward probability if the 332 # emission at the position is the same as b 333 if training_seq.emissions[i] == b: 334 # f_{k}(i) b_{k}(i) 335 expected_times += (forward_vars[(k, i)] * 336 backward_vars[(k, i)]) 337 338 # add to E_{k}(b) 339 emission_counts[(k, b)] += (float(expected_times) / 340 training_seq_prob) 341 342 return emission_counts
343 344
345 -class KnownStateTrainer(AbstractTrainer):
346 """Estimate probabilities with known state sequences. 347 348 This should be used for direct estimation of emission and transition 349 probabilities when both the state path and emission sequence are 350 known for the training examples. 351 """
352 - def __init__(self, markov_model):
353 AbstractTrainer.__init__(self, markov_model)
354
355 - def train(self, training_seqs):
356 """Estimate the Markov Model parameters with known state paths. 357 358 This trainer requires that both the state and the emissions are 359 known for all of the training sequences in the list of 360 TrainingSequence objects. 361 This training will then count all of the transitions and emissions, 362 and use this to estimate the parameters of the model. 363 """ 364 # count up all of the transitions and emissions 365 transition_counts = self._markov_model.get_blank_transitions() 366 emission_counts = self._markov_model.get_blank_emissions() 367 368 for training_seq in training_seqs: 369 emission_counts = self._count_emissions(training_seq, 370 emission_counts) 371 transition_counts = self._count_transitions(training_seq.states, 372 transition_counts) 373 374 # update the markov model from the counts 375 ml_transitions, ml_emissions = \ 376 self.estimate_params(transition_counts, 377 emission_counts) 378 self._markov_model.transition_prob = ml_transitions 379 self._markov_model.emission_prob = ml_emissions 380 381 return self._markov_model
382
383 - def _count_emissions(self, training_seq, emission_counts):
384 """Add emissions from the training sequence to the current counts. 385 386 Arguments: 387 388 o training_seq -- A TrainingSequence with states and emissions 389 to get the counts from 390 391 o emission_counts -- The current emission counts to add to. 392 """ 393 for index in range(len(training_seq.emissions)): 394 cur_state = training_seq.states[index] 395 cur_emission = training_seq.emissions[index] 396 397 try: 398 emission_counts[(cur_state, cur_emission)] += 1 399 except KeyError: 400 raise KeyError("Unexpected emission (%s, %s)" 401 % (cur_state, cur_emission)) 402 return emission_counts
403
404 - def _count_transitions(self, state_seq, transition_counts):
405 """Add transitions from the training sequence to the current counts. 406 407 Arguments: 408 409 o state_seq -- A Seq object with the states of the current training 410 sequence. 411 412 o transition_counts -- The current transition counts to add to. 413 """ 414 for cur_pos in range(len(state_seq) - 1): 415 cur_state = state_seq[cur_pos] 416 next_state = state_seq[cur_pos + 1] 417 418 try: 419 transition_counts[(cur_state, next_state)] += 1 420 except KeyError: 421 raise KeyError("Unexpected transition (%s, %s)" % 422 (cur_state, next_state)) 423 424 return transition_counts
425