Package Bio :: Module MaxEntropy

Source Code for Module Bio.MaxEntropy

# Copyright 2001 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Maximum Entropy code.

Uses Improved Iterative Scaling.
"""
# TODO Define terminology

from __future__ import print_function
from functools import reduce

from Bio._py3k import map

try:
    import numpy
except ImportError:
    from Bio import MissingPythonDependencyError
    raise MissingPythonDependencyError(
        "Install NumPy if you want to use Bio.MaxEntropy.")

class MaxEntropy(object):
    """Holds information for a Maximum Entropy classifier.

    Members:
    classes      List of the possible classes of data.
    alphas       List of the weights for each feature.
    feature_fns  List of the feature functions.

    Car data from the Naive Bayes Classifier example by Eric Meisner,
    November 22, 2003:
    http://www.inf.u-szeged.hu/~ormandi/teaching

    >>> from Bio.MaxEntropy import train, classify
    >>> xcar = [
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Red', 'Sports', 'Domestic'],
    ...     ['Yellow', 'Sports', 'Domestic'],
    ...     ['Yellow', 'Sports', 'Imported'],
    ...     ['Yellow', 'SUV', 'Imported'],
    ...     ['Yellow', 'SUV', 'Imported'],
    ...     ['Yellow', 'SUV', 'Domestic'],
    ...     ['Red', 'SUV', 'Imported'],
    ...     ['Red', 'Sports', 'Imported']]
    >>> ycar = ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes']

    Requires some rules or features:

    >>> def udf1(ts, cl):
    ...     return ts[0] != 'Red'
    ...
    >>> def udf2(ts, cl):
    ...     return ts[1] != 'Sports'
    ...
    >>> def udf3(ts, cl):
    ...     return ts[2] != 'Domestic'
    ...
    >>> user_functions = [udf1, udf2, udf3]  # must be an iterable type
    >>> xe = train(xcar, ycar, user_functions)
    >>> for xv, yv in zip(xcar, ycar):
    ...     xc = classify(xe, xv)
    ...     print('Pred: %s gives %s y is %s' % (xv, xc, yv))
    ...
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is Yes
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is No
    Pred: ['Red', 'Sports', 'Domestic'] gives No y is Yes
    Pred: ['Yellow', 'Sports', 'Domestic'] gives No y is No
    Pred: ['Yellow', 'Sports', 'Imported'] gives No y is Yes
    Pred: ['Yellow', 'SUV', 'Imported'] gives No y is No
    Pred: ['Yellow', 'SUV', 'Imported'] gives No y is Yes
    Pred: ['Yellow', 'SUV', 'Domestic'] gives No y is No
    Pred: ['Red', 'SUV', 'Imported'] gives No y is No
    Pred: ['Red', 'Sports', 'Imported'] gives No y is Yes
    """

    def __init__(self):
        self.classes = []
        self.alphas = []
        self.feature_fns = []

def calculate(me, observation):
    """calculate(me, observation) -> list of log probs

    Calculate the log of the probability for each class.  me is a
    MaxEntropy object that has been trained.  observation is a vector
    representing the observed data.  The return value is a list of
    unnormalized log probabilities for each class.
    """
    scores = []
    assert len(me.feature_fns) == len(me.alphas)
    for klass in me.classes:
        lprob = 0.0
        for fn, alpha in zip(me.feature_fns, me.alphas):
            lprob += fn(observation, klass) * alpha
        scores.append(lprob)
    return scores

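# Note (illustrative sketch, not part of the original module): since the
# scores from calculate() are unnormalized log probabilities, they can be
# turned into proper class probabilities with a softmax, e.g.:
#
#     scores = calculate(me, observation)
#     probs = numpy.exp(scores) / numpy.sum(numpy.exp(scores))
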
def classify(me, observation):
    """classify(me, observation) -> class

    Classify an observation into a class.
    """
    scores = calculate(me, observation)
    max_score, klass = scores[0], me.classes[0]
    for i in range(1, len(scores)):
        if scores[i] > max_score:
            max_score, klass = scores[i], me.classes[i]
    return klass

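# Note (illustrative sketch, not part of the original module): classify()
# is equivalent to taking an argmax over the scores:
#
#     scores = calculate(me, observation)
#     klass = me.classes[int(numpy.argmax(scores))]
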
def _eval_feature_fn(fn, xs, classes):
    """_eval_feature_fn(fn, xs, classes) -> dict of values

    Evaluate a feature function on every instance of the training set
    and class.  fn is a callback function that takes two parameters: a
    training instance and a class.  Return a dictionary of (training
    set index, class index) -> non-zero value.  Values of 0 are not
    stored in the dictionary.
    """
    values = {}
    for i in range(len(xs)):
        for j in range(len(classes)):
            f = fn(xs[i], classes[j])
            if f != 0:
                values[(i, j)] = f
    return values

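# Worked example (hypothetical data, not part of the original module; the
# dict ordering shown assumes an insertion-ordered dict, Python 3.7+):
#
#     >>> xs = [['Red'], ['Yellow']]
#     >>> classes = ['No', 'Yes']
#     >>> fn = lambda ts, cl: int(ts[0] == 'Red')
#     >>> _eval_feature_fn(fn, xs, classes)
#     {(0, 0): 1, (0, 1): 1}
#
# The zero values for the 'Yellow' instance are omitted from the dict.
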
def _calc_empirical_expects(xs, ys, classes, features):
    """_calc_empirical_expects(xs, ys, classes, features) -> list of expectations

    Calculate the expectation of each function from the data.  This is
    the constraint for the maximum entropy distribution.  Return a
    list of expectations, parallel to the list of features.
    """
    # E[f_i] = SUM_x,y P(x, y) f_i(x, y)
    #        = 1/N SUM_i f_i(x_i, y_i)    (empirical distribution)
    class2index = {}
    for index, key in enumerate(classes):
        class2index[key] = index
    ys_i = [class2index[y] for y in ys]

    expect = []
    N = len(xs)
    for feature in features:
        s = 0
        for i in range(N):
            s += feature.get((i, ys_i[i]), 0)
        expect.append(float(s) / N)
    return expect

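# Worked example (hypothetical, continuing the _eval_feature_fn sketch
# above): with xs = [['Red'], ['Yellow']], ys = ['No', 'Yes'] and the
# single cached feature {(0, 0): 1, (0, 1): 1}, only the observed pairs
# (x_i, y_i) count, so the empirical expectation is
# (f(x_0, y_0) + f(x_1, y_1)) / N = (1 + 0) / 2 = 0.5.
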
def _calc_model_expects(xs, classes, features, alphas):
    """_calc_model_expects(xs, classes, features, alphas) -> list of expectations.

    Calculate the expectation of each feature from the model.  This is
    not used in maximum entropy training, but provides a good function
    for debugging.
    """
    # E[f_i] = SUM_x P(x) SUM_y P(y|x) f_i(x, y)
    #        = 1/N SUM_x SUM_y P(y|x) f_i(x, y)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    expects = []
    for feature in features:
        sum_ = 0.0  # avoid shadowing the built-in sum
        for (i, j), f in feature.items():
            sum_ += p_yx[i][j] * f
        expects.append(sum_ / len(xs))
    return expects

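# Debugging sketch (an assumption about usage, not part of the original
# module; the tolerance is arbitrary): at convergence the model
# expectations should approximately match the empirical ones, which makes
# this a handy training check:
#
#     features = [_eval_feature_fn(fn, xs, classes) for fn in me.feature_fns]
#     e_model = _calc_model_expects(xs, classes, features, me.alphas)
#     e_data = _calc_empirical_expects(xs, ys, classes, features)
#     assert numpy.allclose(e_model, e_data, atol=1e-3)
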
def _calc_p_class_given_x(xs, classes, features, alphas):
    """_calc_p_class_given_x(xs, classes, features, alphas) -> matrix

    Calculate P(y|x), where y is the class and x is an instance from
    the training set.  Return a len(xs) x len(classes) matrix of
    probabilities.
    """
    prob_yx = numpy.zeros((len(xs), len(classes)))

    # Calculate log P(y, x).
    assert len(features) == len(alphas)
    for feature, alpha in zip(features, alphas):
        for (x, y), f in feature.items():
            prob_yx[x][y] += alpha * f
    # Take the exponent to get P(y, x).
    prob_yx = numpy.exp(prob_yx)
    # Divide out the probability over each class, so we get P(y|x).
    for i in range(len(xs)):
        z = sum(prob_yx[i])
        prob_yx[i] = prob_yx[i] / z
    return prob_yx

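# Sanity check sketch (not part of the original module): each row of the
# returned matrix is a probability distribution over the classes, so the
# rows should sum to one:
#
#     p_yx = _calc_p_class_given_x(xs, classes, features, alphas)
#     assert numpy.allclose(p_yx.sum(axis=1), 1.0)
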
def _calc_f_sharp(N, nclasses, features):
    """_calc_f_sharp(N, nclasses, features) -> matrix of f sharp values."""
    # f#(x, y) = SUM_i f_i(x, y)
    f_sharp = numpy.zeros((N, nclasses))
    for feature in features:
        for (i, j), f in feature.items():
            f_sharp[i][j] += f
    return f_sharp

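# Worked example (hypothetical, continuing the sketch above): with N = 2,
# nclasses = 2 and features = [{(0, 0): 1, (0, 1): 1}], f# is the matrix
#
#     [[1. 1.]
#      [0. 0.]]
#
# For binary features, f#(x, y) simply counts how many features fire on
# the pair (x, y); this is the quantity IIS needs in the exponent below.
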
def _iis_solve_delta(N, feature, f_sharp, empirical, prob_yx,
                     max_newton_iterations, newton_converge):
    """Solve the IIS update delta_i for one feature (PRIVATE)."""
    # Use Newton's method to solve for delta_i in:
    #   empirical_i - 1/N SUM_x SUM_y P(y|x) f_i(x, y) e^[delta_i * f#(x, y)] = 0
    delta = 0.0
    iters = 0
    while iters < max_newton_iterations:  # iterate for Newton's method
        f_newton = df_newton = 0.0  # evaluate the function and derivative
        for (i, j), f in feature.items():
            prod = prob_yx[i][j] * f * numpy.exp(delta * f_sharp[i][j])
            f_newton += prod
            df_newton += prod * f_sharp[i][j]
        f_newton, df_newton = empirical - f_newton / N, -df_newton / N

        ratio = f_newton / df_newton
        delta -= ratio
        if numpy.fabs(ratio) < newton_converge:  # converged
            break
        iters += 1
    else:
        raise RuntimeError("Newton's method did not converge")
    return delta

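# Restating the iteration above as equations (no new behavior): Newton's
# method repeatedly applies
#
#     delta <- delta - g(delta) / g'(delta)
#
# where g(delta) = empirical - 1/N SUM_x,y P(y|x) f_i(x, y)
# e^[delta * f#(x, y)] is f_newton, and g'(delta), its derivative in
# delta, is df_newton (note that g' is negative, matching the sign above).
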
def _train_iis(xs, classes, features, f_sharp, alphas, e_empirical,
               max_newton_iterations, newton_converge):
    """Do one iteration of hill climbing to find better alphas (PRIVATE)."""
    # This is a good function to parallelize.

    # Pre-calculate P(y|x).
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    N = len(xs)
    newalphas = alphas[:]
    for i in range(len(alphas)):
        delta = _iis_solve_delta(N, features[i], f_sharp, e_empirical[i], p_yx,
                                 max_newton_iterations, newton_converge)
        newalphas[i] += delta
    return newalphas

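# Parallelization sketch (an assumption, not part of the original module):
# once p_yx is fixed, each per-feature delta solve is independent, so the
# loop above could be distributed across processes, e.g. on Python 3:
#
#     from multiprocessing import Pool
#     args = [(N, features[i], f_sharp, e_empirical[i], p_yx,
#              max_newton_iterations, newton_converge)
#             for i in range(len(alphas))]
#     with Pool() as pool:
#         deltas = pool.starmap(_iis_solve_delta, args)
#     newalphas = [a + d for a, d in zip(alphas, deltas)]
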
def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        diff = map(lambda x, y: numpy.fabs(x - y), alphas, nalphas)
        diff = reduce(lambda x, y: x + y, diff, 0)
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:  # converged
            break
        iters += 1  # without this, max_iis_iterations would never be reached
    else:
        raise RuntimeError("IIS did not converge")

    return me

if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest(verbose=0)