Source Code for Module Bio.HMM.DynamicProgramming

# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Dynamic Programming algorithms for general usage.

This module contains classes which implement Dynamic Programming
algorithms that can be used generally.
"""

from Bio._py3k import range


class AbstractDPAlgorithms(object):
    """An abstract class to calculate forward and backward probabilities.

    This class should not be instantiated directly, but should be used
    through a derived class which implements proper scaling of variables.

    This class is just meant to encapsulate the basic forward and backward
    algorithms, and allow derived classes to deal with the problems of
    multiplying probabilities.

    Derived classes of this must implement the following (a minimal
    subclass sketch follows this class definition):

    o _forward_recursion -- Calculate the forward values in the recursion
    step using some kind of technique for preventing underflow errors.

    o _backward_recursion -- Calculate the backward values in the recursion
    step using some technique to prevent underflow errors.
    """
    def __init__(self, markov_model, sequence):
        """Initialize to calculate forward and backward probabilities.

        Arguments:

        o markov_model -- The current Markov model we are working with.

        o sequence -- A training sequence containing a set of emissions.
        """
        self._mm = markov_model
        self._seq = sequence

    def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
        """Calculate the forward recursion value."""
        raise NotImplementedError("Subclasses must implement")

    def forward_algorithm(self):
        """Calculate sequence probability using the forward algorithm.

        This implements the forward algorithm, as described on p57-58 of
        Durbin et al.

        Returns:

        o A dictionary containing the forward variables. This has keys of the
        form (state letter, position in the training sequence), and values
        containing the calculated forward variable.

        o The calculated probability of the sequence.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        forward_var = {}
        # f_{0}(0) = 1
        forward_var[(state_letters[0], -1)] = 1
        # f_{k}(0) = 0, for k > 0
        for k in range(1, len(state_letters)):
            forward_var[(state_letters[k], -1)] = 0

        # -- now do the recursion step
        # loop over the training sequence
        # Recursion step: (i = 1 .. L)
        for i in range(len(self._seq.emissions)):
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the forward value using the appropriate
                # method to prevent underflow errors
                forward_value = self._forward_recursion(main_state, i,
                                                        forward_var)

                if forward_value is not None:
                    forward_var[(main_state, i)] = forward_value

        # -- termination step - calculate the probability of the sequence
        first_state = state_letters[0]
        seq_prob = 0

        for state_item in state_letters:
            # f_{k}(L)
            forward_value = forward_var[(state_item,
                                         len(self._seq.emissions) - 1)]
            # a_{k0}
            transition_value = self._mm.transition_prob[(state_item,
                                                         first_state)]

            seq_prob += forward_value * transition_value

        return forward_var, seq_prob

    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
        """Calculate the backward recursion value."""
        raise NotImplementedError("Subclasses must implement")

    def backward_algorithm(self):
        """Calculate sequence probability using the backward algorithm.

        This implements the backward algorithm, as described on p58-59 of
        Durbin et al.

        Returns:

        o A dictionary containing the backward variables. This has keys
        of the form (state letter, position in the training sequence),
        and values containing the calculated backward variable.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        backward_var = {}

        first_letter = state_letters[0]
        # b_{k}(L) = a_{k0} for all k
        for state in state_letters:
            backward_var[(state, len(self._seq.emissions) - 1)] = \
                self._mm.transition_prob[(state, first_letter)]

        # -- recursion
        # first loop over the training sequence backwards
        # Recursion step: (i = L - 1 ... 1)
        all_indexes = list(range(len(self._seq.emissions) - 1))
        all_indexes.reverse()
        for i in all_indexes:
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the backward value using the appropriate
                # method to prevent underflow errors
                backward_value = self._backward_recursion(main_state, i,
                                                          backward_var)

                if backward_value is not None:
                    backward_var[(main_state, i)] = backward_value

        # skip the termination step to avoid recalculations -- you should
        # get sequence probabilities using the forward algorithm
        return backward_var
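

# A hedged sketch of the derived-class contract described in
# AbstractDPAlgorithms above.  The class name is invented for
# illustration, it applies no underflow protection (so it is a sketch
# only -- see ScaledDPAlgorithms below for real work), and it assumes
# the model offers a transitions_to counterpart to the transitions_from
# helper used elsewhere in this module.
class _UnscaledDPAlgorithmsSketch(AbstractDPAlgorithms):
    """Illustrate the recursion steps with the plain, unscaled recurrences."""
    def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
        # f_{l}(i) = e_{l}(x_{i}) * sum_k f_{k}(i - 1) * a_{kl}
        seq_letter = self._seq.emissions[sequence_pos]
        emission = self._mm.emission_prob[(cur_state, seq_letter)]
        state_sum = 0
        for second_state in self._mm.transitions_to(cur_state):
            state_sum += (forward_vars[(second_state, sequence_pos - 1)] *
                          self._mm.transition_prob[(second_state, cur_state)])
        return emission * state_sum

    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
        # b_{k}(i) = sum_l a_{kl} * e_{l}(x_{i + 1}) * b_{l}(i + 1)
        seq_letter = self._seq.emissions[sequence_pos + 1]
        state_sum = 0
        for second_state in self._mm.transitions_from(cur_state):
            state_sum += (self._mm.transition_prob[(cur_state, second_state)] *
                          self._mm.emission_prob[(second_state, seq_letter)] *
                          backward_vars[(second_state, sequence_pos + 1)])
        return state_sum

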
class ScaledDPAlgorithms(AbstractDPAlgorithms):
    """Implement forward and backward algorithms using a rescaling approach.

    This scales the f and b variables, so that they remain within a
    manageable numerical interval during calculations. This approach is
    described in Durbin et al. on p 78.

    This approach is a little more straightforward than log transformation
    but may still give underflow errors for some types of models. In these
    cases, the LogDPAlgorithms class should be used.
    """
    def __init__(self, markov_model, sequence):
        """Initialize the scaled approach to calculating probabilities.

        Arguments:

        o markov_model -- The current Markov model we are working with.

        o sequence -- A TrainingSequence object that must have a
        set of emissions to work with.
        """
        AbstractDPAlgorithms.__init__(self, markov_model, sequence)

        self._s_values = {}

    def _calculate_s_value(self, seq_pos, previous_vars):
        """Calculate the next scaling variable for a sequence position.

        This utilizes the approach of choosing s values such that the
        sum of all of the scaled f values is equal to 1.

        Arguments:

        o seq_pos -- The current position we are at in the sequence.

        o previous_vars -- All of the forward or backward variables
        calculated so far.

        Returns:

        o The calculated scaling variable for the sequence item.
        """
        # all of the different letters the state can have
        state_letters = self._seq.states.alphabet.letters

        # loop over all of the possible states
        s_value = 0
        for main_state in state_letters:
            emission = self._mm.emission_prob[(main_state,
                                               self._seq.emissions[seq_pos])]

            # now sum over the previous vars and the transitions
            # into main_state
            trans_and_var_sum = 0
            for second_state in self._mm.transitions_to(main_state):
                # the value of the previous f or b value
                var_value = previous_vars[(second_state, seq_pos - 1)]

                # the transition probability
                trans_value = self._mm.transition_prob[(second_state,
                                                        main_state)]

                trans_and_var_sum += (var_value * trans_value)

            s_value += (emission * trans_and_var_sum)

        return s_value
    def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
        """Calculate the value of the forward recursion.

        Arguments:

        o cur_state -- The letter of the state we are calculating the
        forward variable for.

        o sequence_pos -- The position we are at in the training seq.

        o forward_vars -- The current set of forward variables
        """
        # calculate the s value, if we haven't done so already (i.e. during
        # a previous forward or backward recursion)
        if sequence_pos not in self._s_values:
            self._s_values[sequence_pos] = \
                self._calculate_s_value(sequence_pos, forward_vars)

        # e_{l}(x_{i})
        seq_letter = self._seq.emissions[sequence_pos]
        cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)]
        # divide by the scaling value
        scale_emission_prob = (float(cur_emission_prob) /
                               float(self._s_values[sequence_pos]))

        # sum over all of the states with a transition into cur_state
        state_pos_sum = 0
        have_transition = False
        for second_state in self._mm.transitions_to(cur_state):
            have_transition = True

            # get the previous forward_var value
            # f_{k}(i - 1)
            prev_forward = forward_vars[(second_state, sequence_pos - 1)]

            # a_{kl}
            cur_trans_prob = self._mm.transition_prob[(second_state,
                                                       cur_state)]
            state_pos_sum += prev_forward * cur_trans_prob

        # if we have the possibility of a transition,
        # return the recursion value
        if have_transition:
            return (scale_emission_prob * state_pos_sum)
        else:
            return None

    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
        """Calculate the value of the backward recursion.

        Arguments:

        o cur_state -- The letter of the state we are calculating the
        backward variable for.

        o sequence_pos -- The position we are at in the training seq.

        o backward_vars -- The current set of backward variables
        """
        # calculate the s value, if we haven't done so already (i.e. during
        # a previous forward or backward recursion)
        if sequence_pos not in self._s_values:
            self._s_values[sequence_pos] = \
                self._calculate_s_value(sequence_pos, backward_vars)

        # the letter emitted at the next position -- x_{i + 1}
        seq_letter = self._seq.emissions[sequence_pos + 1]

        # sum over all of the states reachable from cur_state
        state_pos_sum = 0
        have_transition = False
        for second_state in self._mm.transitions_from(cur_state):
            have_transition = True

            # e_{l}(x_{i + 1})
            cur_emission_prob = self._mm.emission_prob[(second_state,
                                                        seq_letter)]

            # get the next backward_var value
            # b_{l}(i + 1)
            prev_backward = backward_vars[(second_state, sequence_pos + 1)]

            # the transition probability -- a_{kl}
            cur_transition_prob = self._mm.transition_prob[(cur_state,
                                                            second_state)]

            state_pos_sum += (cur_emission_prob * prev_backward *
                              cur_transition_prob)

        # if we have a probability for a transition, return it
        if have_transition:
            return (state_pos_sum / float(self._s_values[sequence_pos]))
        # otherwise we have no probability (i.e. we can't do this
        # transition) and return None
        else:
            return None


class LogDPAlgorithms(AbstractDPAlgorithms):
    """Implement forward and backward algorithms using a log approach.

    This uses the approach of calculating the sum of log probabilities
    using a lookup table for common values.

    XXX This is not implemented yet!
    """
    def __init__(self, markov_model, sequence):
        raise NotImplementedError("Haven't coded this yet...")
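

# One possible shape for the log-space sum alluded to in the
# LogDPAlgorithms docstring -- a sketch only, not the planned
# implementation.  It computes log(p + q) from log(p) and log(q); the
# log1p(exp(x)) term is the quantity a lookup table of common values
# would approximate.
from math import exp, log1p


def _log_sum_sketch(log_p, log_q):
    """Return log(p + q) given log(p) and log(q) (illustrative only)."""
    # keep the larger term outside the exp to avoid overflow
    if log_p < log_q:
        log_p, log_q = log_q, log_p
    # log(p + q) = log(p) + log(1 + exp(log(q) - log(p)))
    return log_p + log1p(exp(log_q - log_p))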