
Source Code for Module Bio.HMM.Trainer

# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two parameters:

* a_{kl} -- the probability of a transition from state k to state l,
  estimated from the counts of transitions in the training data.

* e_{k}(b) -- the probability that state k emits the letter b, estimated
  from the counts of emissions in the training data.
"""
# standard modules
import math

# local stuff
from .DynamicProgramming import ScaledDPAlgorithms

class TrainingSequence(object):
    """Hold a training sequence with emissions and, optionally, a state path."""

    def __init__(self, emissions, state_path):
        """Initialize a training sequence.

        Arguments:

        o emissions -- A Seq object containing the sequence of emissions in
          the training sequence, and the alphabet of the sequence.

        o state_path -- A Seq object containing the sequence of states and
          the alphabet of the states. If there is no known state path, then
          the sequence of states should be an empty string.
        """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path


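As a minimal usage sketch (not part of the module itself), here is how training sequences might be built for the classic occasionally-dishonest-casino dice example, assuming the older alphabet-based Seq API that this listing was written against; DiceRollAlphabet and DiceTypeAlphabet are illustrative stand-ins:

    from Bio.Alphabet import Alphabet
    from Bio.Seq import Seq

    class DiceRollAlphabet(Alphabet):
        letters = ['1', '2', '3', '4', '5', '6']

    class DiceTypeAlphabet(Alphabet):
        letters = ['F', 'L']  # fair or loaded die

    # emissions with a known state path
    known_seq = TrainingSequence(Seq('266166', DiceRollAlphabet()),
                                 Seq('FFFFFF', DiceTypeAlphabet()))

    # emissions with no known state path (as needed for Baum-Welch training)
    unknown_seq = TrainingSequence(Seq('266166', DiceRollAlphabet()),
                                   Seq('', DiceTypeAlphabet()))
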
class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers."""

    def __init__(self, markov_model):
        self._markov_model = markov_model

    def log_likelihood(self, probabilities):
        """Calculate the log likelihood of the training seqs.

        Arguments:

        o probabilities -- A list of the probabilities of each training
          sequence under the current parameters, calculated using the forward
          algorithm.
        """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood

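    # For example, log_likelihood([0.5, 0.25]) comes out as
    # math.log(0.5) + math.log(0.25), roughly -2.079 (natural log).
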
    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transition and emission.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
          of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
          of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated by formula 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
        """
        # now calculate the information
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions

    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all items
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the total counts for the current letter we are on
        cur_letter = None
        cur_letter_counts = 0

        for cur_item in all_ordered:
            # if we are on a new letter (ie. the first letter of the tuple)
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # count up the total counts for this letter
                cur_letter_counts = counts[cur_item]

                # add counts for all other items with the same first letter
                cur_position = all_ordered.index(cur_item) + 1

                # keep adding while we have the same first letter or until
                # we get to the end of the ordered list
                while (cur_position < len(all_ordered) and
                       all_ordered[cur_position][0] == cur_item[0]):
                    cur_letter_counts += counts[all_ordered[cur_position]]
                    cur_position += 1
            # otherwise we've already got the total counts for this letter

            # now calculate the ml and add it to the estimation
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation


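A quick worked example of that normalization: each count is divided by the total for its first letter. AbstractTrainer is not normally instantiated directly, but ml_estimator never touches the model, so None suffices here:

    from Bio.HMM.Trainer import AbstractTrainer

    counts = {('F', 'F'): 3, ('F', 'L'): 1, ('L', 'F'): 2, ('L', 'L'): 2}
    print(AbstractTrainer(None).ml_estimator(counts))
    # {('F', 'F'): 0.75, ('F', 'L'): 0.25, ('L', 'F'): 0.5, ('L', 'L'): 0.5}
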
class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. An inequality and associated maximization technique
    in statistical estimation for probabilistic functions of Markov
    processes. Inequalities 3:1-8.
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3.

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """

    def __init__(self, markov_model):
        """Initialize the trainer.

        Arguments:

        o markov_model -- The model we are going to estimate parameters for.
          This should have the parameters with some initial estimates, that
          we can build from.
        """
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
          for estimating the parameters.

        o stopping_criteria -- A function that, when passed the change in
          log likelihood and the number of iterations, will indicate whether
          we should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
          implementation we should use to calculate the forward and
          backward variables. By default, we use the scaling method.
        """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember all of the sequence probabilities
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model with the new probabilities
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if we have previously calculated the log likelihood (ie.
            # not the first round), see if we can finish
            if prev_log_likelihood is not None:
                # log likelihoods are negative; since both values have the
                # same sign, abs(abs(cur) - abs(prev)) equals
                # abs(cur - prev), the magnitude of the change
                log_likelihood_change = abs(abs(cur_log_likelihood) -
                                            abs(prev_log_likelihood))

                # check whether we have completed enough iterations to have
                # a good estimation
                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            # set up for another round of iterations
            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model

    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
          transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
          algorithm.

        o backward_vars -- Probabilities calculated using the backward
          algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
        """
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # the forward value of k at the current position
                    forward_value = forward_vars[(k, i)]

                    # the backward value of l in the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # the probability of a transition from k to l
                    trans_value = transitions[(k, l)]

                    # the probability of getting the emission at the next pos
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the transition approximation
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts
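    # For reference, the contribution accumulated above is formula 3.20
    # from Durbin et al:
    #
    #   A_{kl} += (1 / P(x)) * sum_i f_k(i) * a_{kl} * e_l(x_{i+1}) * b_l(i+1)
    #
    # where f and b are the forward and backward variables and P(x) is the
    # sequence probability (training_seq_prob).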

    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
          emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
          algorithm.

        o backward_vars -- Probabilities calculated using the backward
          algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the estimated emission probability for
        emission letter b from state k) using formula 3.21 in Durbin et al.
        """
        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            # now loop over all of the possible emissions
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # finally loop over the entire training sequence
                for i in range(len(training_seq.emissions)):
                    # only count the forward and backward probability if the
                    # emission at the position is the same as b
                    if training_seq.emissions[i] == b:
                        # f_{k}(i) b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # add to E_{k}(b)
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts
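    # For reference, the contribution accumulated above is formula 3.21
    # from Durbin et al:
    #
    #   E_{k}(b) += (1 / P(x)) * sum_{i : x_i = b} f_k(i) * b_k(i)
    #
    # i.e. the expected number of times state k emits letter b, summed over
    # the positions where b was actually observed.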
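Continuing the dice sketch from above, here is a hedged end-to-end example of Baum-Welch training. The builder calls follow the companion Bio.HMM.MarkovModel module; the exact builder API may vary between Biopython versions:

    from Bio.HMM import MarkovModel, Trainer

    # build a model with arbitrary initial estimates to refine
    builder = MarkovModel.MarkovModelBuilder(DiceTypeAlphabet(),
                                             DiceRollAlphabet())
    builder.allow_all_transitions()
    builder.set_random_probabilities()
    model = builder.get_markov_model()

    def stop_training(log_likelihood_change, num_iterations):
        """Stop when the log likelihood barely improves, or after 10 rounds."""
        return log_likelihood_change < 0.01 or num_iterations >= 10

    trainer = Trainer.BaumWelchTrainer(model)
    trained = trainer.train([unknown_seq], stop_training)
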
class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """

    def __init__(self, markov_model):
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the states and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use this to estimate the parameters of the model.
        """
        # count up all of the transitions and emissions
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model from the counts
        ml_transitions, ml_emissions = \
            self.estimate_params(transition_counts,
                                 emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model

    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
          to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts

    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
          sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts
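
And a matching sketch for the fully labelled case, reusing the hypothetical model and known_seq from the earlier examples; since the state path is known, a single pass of counting and normalization is all that is needed:

    from Bio.HMM import Trainer

    trainer = Trainer.KnownStateTrainer(model)
    trained = trainer.train([known_seq])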