Package Bio :: Module MaxEntropy

Source Code for Module Bio.MaxEntropy

# Copyright 2001 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Maximum Entropy code.

Uses Improved Iterative Scaling.
"""
# TODO Define terminology

from __future__ import print_function
from functools import reduce

try:
    import numpy
except ImportError:
    from Bio import MissingPythonDependencyError
    raise MissingPythonDependencyError(
        "Install NumPy if you want to use Bio.MaxEntropy.")

class MaxEntropy(object):
    """Hold information for a Maximum Entropy classifier.

    Members:
    classes      List of the possible classes of data.
    alphas       List of the weights for each feature.
    feature_fns  List of the feature functions.

    Car data from the Naive Bayes Classifier example by Eric Meisner,
    November 22, 2003:
    http://www.inf.u-szeged.hu/~ormandi/teaching

    >>> from Bio.MaxEntropy import train, classify
    >>> xcar = [
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Yellow', 'Sports', 'Domestic'],
    ...     ['Yellow', 'Sports', 'Imported'],
    ...     ['Yellow', 'SUV', 'Imported'],
    ...     ['Yellow', 'SUV', 'Imported'],
    ...     ['Yellow', 'SUV', 'Domestic'],
    ...     ['Red', 'SUV', 'Imported'],
    ...     ['Red', 'Sports', 'Imported']]
    >>> ycar = ['Yes','No','Yes','No','Yes','No','Yes','No','No','Yes']

    Requires some rules or features:

    >>> def udf1(ts, cl):
    ...     return ts[0] != 'Red'
    ...
    >>> def udf2(ts, cl):
    ...     return ts[1] != 'Sports'
    ...
    >>> def udf3(ts, cl):
    ...     return ts[2] != 'Domestic'
    ...
    >>> user_functions = [udf1, udf2, udf3]  # must be an iterable type
    >>> xe = train(xcar, ycar, user_functions)
    >>> for xv, yv in zip(xcar, ycar):
    ...     xc = classify(xe, xv)
    ...     print('Pred: %s gives %s y is %s' % (xv, xc, yv))
    ...
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is Yes
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is No
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is Yes
    Pred: ['Yellow', 'Sports', 'Domestic'] gives No y is No
    Pred: ['Yellow', 'Sports', 'Imported'] gives No y is Yes
    Pred: ['Yellow', 'SUV', 'Imported'] gives No y is No
    Pred: ['Yellow', 'SUV', 'Imported'] gives No y is Yes
    Pred: ['Yellow', 'SUV', 'Domestic'] gives No y is No
    Pred: ['Red', 'SUV', 'Imported'] gives No y is No
    Pred: ['Red', 'Sports', 'Imported'] gives No y is Yes
    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.alphas = []
        self.feature_fns = []

def calculate(me, observation):
    """Calculate the log of the probability for each class.

    me is a MaxEntropy object that has been trained.  observation is a
    vector representing the observed data.  The return value is a list of
    unnormalized log probabilities for each class.
    """
    scores = []
    assert len(me.feature_fns) == len(me.alphas)
    for klass in me.classes:
        lprob = 0.0
        for fn, alpha in zip(me.feature_fns, me.alphas):
            lprob += fn(observation, klass) * alpha
        scores.append(lprob)
    return scores

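Because the scores returned by calculate are unnormalized log probabilities, a softmax turns them into proper class probabilities. A minimal sketch, assuming a trained MaxEntropy object; the helper name normalized_probs is ours, not part of this module:

def normalized_probs(me, observation):
    # Softmax over the unnormalized log probabilities from calculate().
    scores = calculate(me, observation)
    # Subtracting the maximum score before exponentiating improves
    # numerical stability and does not change the normalized result.
    exps = [numpy.exp(s - max(scores)) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

classify below is then equivalent to taking the class with the largest of these probabilities.
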
def classify(me, observation):
    """Classify an observation into a class."""
    scores = calculate(me, observation)
    max_score, klass = scores[0], me.classes[0]
    for i in range(1, len(scores)):
        if scores[i] > max_score:
            max_score, klass = scores[i], me.classes[i]
    return klass

def _eval_feature_fn(fn, xs, classes):
    """Evaluate a feature function on every instance of the training set and class (PRIVATE).

    fn is a callback function that takes two parameters: a training
    instance and a class.  Return a dictionary of (training set index,
    class index) -> non-zero value.  Values of 0 are not stored in the
    dictionary.
    """
    values = {}
    for i in range(len(xs)):
        for j in range(len(classes)):
            f = fn(xs[i], classes[j])
            if f != 0:
                values[(i, j)] = f
    return values

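The sparse (index pair) -> value layout is easiest to see on a toy input. A hypothetical snippet (output shown assuming an insertion-ordered dict, as in Python 3.7+):

xs = [['Red', 'Sports', 'Domestic'], ['Yellow', 'SUV', 'Imported']]

def not_red(ts, cl):
    return ts[0] != 'Red'

_eval_feature_fn(not_red, xs, ['No', 'Yes'])
# => {(1, 0): True, (1, 1): True} -- zero values for instance 0 are omitted
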
def _calc_empirical_expects(xs, ys, classes, features):
    """Calculate the expectation of each function from the data (PRIVATE).

    This is the constraint for the maximum entropy distribution.  Return a
    list of expectations, parallel to the list of features.
    """
    # E[f_i] = SUM_x,y P(x, y) f_i(x, y)
    #        = 1/N SUM_j f_i(x_j, y_j)
    class2index = {}
    for index, key in enumerate(classes):
        class2index[key] = index
    ys_i = [class2index[y] for y in ys]

    expect = []
    N = len(xs)
    for feature in features:
        s = 0
        for i in range(N):
            s += feature.get((i, ys_i[i]), 0)
        expect.append(float(s) / N)
    return expect

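A worked check of the formula: with two training pairs and a single feature that fires on class 'Yes' for both instances, only the instance actually labelled 'Yes' contributes. A hypothetical snippet:

xs = [['Red', 'Sports', 'Domestic'], ['Yellow', 'SUV', 'Imported']]
ys = ['Yes', 'No']
features = [{(0, 1): 1, (1, 1): 1}]  # fires on class index 1 ('Yes')
_calc_empirical_expects(xs, ys, ['No', 'Yes'], features)
# => [0.5], i.e. (1 + 0) / 2
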
def _calc_model_expects(xs, classes, features, alphas):
    """Calculate the expectation of each feature from the model (PRIVATE).

    This is not used in maximum entropy training, but provides a good
    function for debugging.
    """
    # E[f_i] = SUM_X P(x) SUM_Y P(Y|X) f_i(X, Y)
    #        = 1/N SUM_X SUM_Y P(Y|X) f_i(X, Y)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    expects = []
    for feature in features:
        total = 0.0  # renamed from "sum" to avoid shadowing the builtin
        for (i, j), f in feature.items():
            total += p_yx[i][j] * f
        expects.append(total / len(xs))
    return expects

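Since IIS drives the model expectations toward the empirical ones, comparing the two is a handy convergence check. A sketch, assuming xe, xcar and ycar from the class doctest above:

features = [_eval_feature_fn(fn, xcar, xe.classes) for fn in xe.feature_fns]
e_model = _calc_model_expects(xcar, xe.classes, features, xe.alphas)
e_emp = _calc_empirical_expects(xcar, ycar, xe.classes, features)
# After training, each pair of entries should agree closely.
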
def _calc_p_class_given_x(xs, classes, features, alphas):
    """Calculate conditional probability P(y|x) (PRIVATE).

    y is the class and x is an instance from the training set.
    Return a len(xs) x len(classes) matrix of probabilities.
    """
    prob_yx = numpy.zeros((len(xs), len(classes)))

    # Calculate log P(y, x).
    assert len(features) == len(alphas)
    for feature, alpha in zip(features, alphas):
        for (x, y), f in feature.items():
            prob_yx[x][y] += alpha * f
    # Take an exponent to get P(y, x).
    prob_yx = numpy.exp(prob_yx)
    # Divide out the probability over each class, so we get P(y|x).
    for i in range(len(xs)):
        z = sum(prob_yx[i])
        prob_yx[i] = prob_yx[i] / z
    return prob_yx

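Because the alpha-weighted sums are exponentiated directly, very large scores could overflow numpy.exp. A common guard (a sketch of an alternative for the exponentiation and normalization steps above, not what the module does) subtracts each row's maximum log score first, which leaves the normalized rows unchanged:

# prob_yx here still holds the log scores, shape (len(xs), len(classes)).
log_pyx = prob_yx - prob_yx.max(axis=1)[:, numpy.newaxis]
probs = numpy.exp(log_pyx)
probs /= probs.sum(axis=1)[:, numpy.newaxis]
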
def _calc_f_sharp(N, nclasses, features):
    """Calculate a matrix of f sharp values (PRIVATE)."""
    # f#(x, y) = SUM_i f_i(x, y)
    f_sharp = numpy.zeros((N, nclasses))
    for feature in features:
        for (i, j), f in feature.items():
            f_sharp[i][j] += f
    return f_sharp

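For 0/1 features, f#(x, y) simply counts how many features fire on each (instance, class) pair. A small hypothetical example:

f1 = {(0, 0): 1}
f2 = {(0, 0): 1, (1, 1): 1}
_calc_f_sharp(2, 2, [f1, f2])
# => array([[2., 0.],
#           [0., 1.]])
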
def _iis_solve_delta(N, feature, f_sharp, empirical, prob_yx,
                     max_newton_iterations, newton_converge):
    """Solve for delta using Newton's method (PRIVATE)."""
    # Solve for delta_i in:
    #   SUM_x P(x) SUM_c P(c|x) f_i(x, c) e^[delta_i * f#(x, c)] = empirical
    delta = 0.0
    iters = 0
    while iters < max_newton_iterations:  # iterate for Newton's method
        f_newton = df_newton = 0.0  # evaluate the function and derivative
        for (i, j), f in feature.items():
            prod = prob_yx[i][j] * f * numpy.exp(delta * f_sharp[i][j])
            f_newton += prod
            df_newton += prod * f_sharp[i][j]
        f_newton, df_newton = empirical - f_newton / N, -df_newton / N

        ratio = f_newton / df_newton
        delta -= ratio
        if numpy.fabs(ratio) < newton_converge:  # converged
            break
        iters = iters + 1
    else:
        raise RuntimeError("Newton's method did not converge")
    return delta

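When f# equals the same constant M wherever a feature fires (for example, binary features with exactly M of them active on every pair), the equation above has a closed form and Newton iteration is unnecessary; this is the classic GIS-style update. A worked rearrangement under that assumption:

#   empirical = (1/N) SUM P(c|x) f_i(x, c) e^[delta_i * M]
#             = e^[delta_i * M] * E_model[f_i]
#   =>  delta_i = log(empirical / E_model[f_i]) / M
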
def _train_iis(xs, classes, features, f_sharp, alphas, e_empirical,
               max_newton_iterations, newton_converge):
    """Do one iteration of hill climbing to find better alphas (PRIVATE)."""
    # This is a good function to parallelize.

    # Pre-calculate P(y|x)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    N = len(xs)
    newalphas = alphas[:]
    for i in range(len(alphas)):
        delta = _iis_solve_delta(N, features[i], f_sharp, e_empirical[i],
                                 p_yx, max_newton_iterations,
                                 newton_converge)
        newalphas[i] += delta
    return newalphas

def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returning a MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        diff = [numpy.fabs(x - y) for x, y in zip(alphas, nalphas)]
        diff = reduce(numpy.add, diff, 0)
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:  # converged
            break
        iters += 1  # count toward max_iis_iterations (otherwise the cap is never reached)
    else:
        raise RuntimeError("IIS did not converge")

    return me

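The update_fn hook makes it easy to watch training converge. A minimal sketch using the car data from the class doctest above; the callback name report is ours:

def report(me):
    # Called once per IIS iteration with the current model state.
    print("alphas: %s" % ", ".join("%0.3f" % a for a in me.alphas))

xe = train(xcar, ycar, user_functions, update_fn=report)
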
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest(verbose=0)