Package Bio :: Module NaiveBayes
[hide private]
[frames] | no frames]

Source Code for Module Bio.NaiveBayes

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This provides code for a general Naive Bayes learner. 
  7   
  8  Naive Bayes is a supervised classification algorithm that uses Bayes 
  9  rule to compute the fit between a new observation and some previously 
 10  observed data.  The observations are discrete feature vectors, with 
 11  the Bayes assumption that the features are independent.  Although this 
 12  is hardly ever true, the classifier works well enough in practice. 
 13   
 14  Glossary: 
 15  observation    A feature vector of discrete data. 
 16  class          A possible classification for an observation. 
 17   
 18   
 19  Classes: 
 20  NaiveBayes     Holds information for a naive Bayes classifier. 
 21   
 22  Functions: 
 23  train          Train a new naive Bayes classifier. 
 24  calculate      Calculate the probabilities of each class, given an observation. 
 25  classify       Classify an observation into a class. 
 26   
 27  """ 
 28   
 29  import numpy 
 30   
 31   
32 -def _contents(items):
33 term = 1.0/len(items) 34 counts = {} 35 for item in items: 36 counts[item] = counts.get(item, 0) + term 37 return counts
38 39
40 -class NaiveBayes(object):
41 """Holds information for a NaiveBayes classifier. 42 43 Members: 44 classes List of the possible classes of data. 45 p_conditional CLASS x DIM array of dicts of value -> P(value|class,dim) 46 p_prior List of the prior probabilities for every class. 47 dimensionality Dimensionality of the data. 48 49 """
50 - def __init__(self):
51 self.classes = [] 52 self.p_conditional = None 53 self.p_prior = [] 54 self.dimensionality = None
55 56
57 -def calculate(nb, observation, scale=0):
58 """calculate(nb, observation[, scale]) -> probability dict 59 60 Calculate log P(class|observation) for each class. nb is a NaiveBayes 61 classifier that has been trained. observation is a list representing 62 the observed data. scale is whether the probability should be 63 scaled by P(observation). By default, no scaling is done. The return 64 value is a dictionary where the keys is the class and the value is the 65 log probability of the class. 66 67 """ 68 # P(class|observation) = P(observation|class)*P(class)/P(observation) 69 # Taking the log: 70 # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation) 71 72 # Make sure the observation has the right dimensionality. 73 if len(observation) != nb.dimensionality: 74 raise ValueError("observation in %d dimension, but classifier in %d" 75 % (len(observation), nb.dimensionality)) 76 77 # Calculate log P(observation|class) for every class. 78 n = len(nb.classes) 79 lp_observation_class = numpy.zeros(n) # array of log P(observation|class) 80 for i in range(n): 81 # log P(observation|class) = SUM_i log P(observation_i|class) 82 probs = [None] * len(observation) 83 for j in range(len(observation)): 84 probs[j] = nb.p_conditional[i][j].get(observation[j], 0) 85 lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300)) 86 lp_observation_class[i] = sum(lprobs) 87 88 # Calculate log P(class). 89 lp_prior = numpy.log(nb.p_prior) 90 91 # Calculate log P(observation). 92 lp_observation = 0.0 # P(observation) 93 if scale: # Only calculate this if requested. 94 # log P(observation) = log SUM_i P(observation|class_i)P(class_i) 95 obs = numpy.exp(numpy.clip(lp_prior+lp_observation_class, -700, +700)) 96 lp_observation = numpy.log(sum(obs)) 97 98 # Calculate log P(class|observation). 99 lp_class_observation = {} # Dict of class : log P(class|observation) 100 for i in range(len(nb.classes)): 101 lp_class_observation[nb.classes[i]] = \ 102 lp_observation_class[i] + lp_prior[i] - lp_observation 103 104 return lp_class_observation
105 106
107 -def classify(nb, observation):
108 """classify(nb, observation) -> class 109 110 Classify an observation into a class. 111 112 """ 113 # The class is the one with the highest probability. 114 probs = calculate(nb, observation, scale=0) 115 max_prob = max_class = None 116 for klass in nb.classes: 117 if max_prob is None or probs[klass] > max_prob: 118 max_prob, max_class = probs[klass], klass 119 return max_class
120 121
122 -def train(training_set, results, priors=None, typecode=None):
123 """train(training_set, results[, priors]) -> NaiveBayes 124 125 Train a naive bayes classifier on a training set. training_set is a 126 list of observations. results is a list of the class assignments 127 for each observation. Thus, training_set and results must be the same 128 length. priors is an optional dictionary specifying the prior 129 probabilities for each type of result. If not specified, the priors 130 will be estimated from the training results. 131 132 """ 133 if not len(training_set): 134 raise ValueError("No data in the training set.") 135 if len(training_set) != len(results): 136 raise ValueError("training_set and results should be parallel lists.") 137 138 # If no typecode is specified, try to pick a reasonable one. If 139 # training_set is a Numeric array, then use that typecode. 140 # Otherwise, choose a reasonable default. 141 # XXX NOT IMPLEMENTED 142 143 # Check to make sure each vector in the training set has the same 144 # dimensionality. 145 dimensions = [len(x) for x in training_set] 146 if min(dimensions) != max(dimensions): 147 raise ValueError("observations have different dimensionality") 148 149 nb = NaiveBayes() 150 nb.dimensionality = dimensions[0] 151 152 # Get a list of all the classes, and 153 # estimate the prior probabilities for the classes. 154 if priors is not None: 155 percs = priors 156 nb.classes = list(set(results)) 157 else: 158 class_freq = _contents(results) 159 nb.classes = class_freq.keys() 160 percs = class_freq 161 nb.classes.sort() # keep it tidy 162 163 nb.p_prior = numpy.zeros(len(nb.classes)) 164 for i in range(len(nb.classes)): 165 nb.p_prior[i] = percs[nb.classes[i]] 166 167 # Collect all the observations in class. For each class, make a 168 # matrix of training instances versus dimensions. I might be able 169 # to optimize this with Numeric, if the training_set parameter 170 # were guaranteed to be a matrix. However, this may not be the 171 # case, because the client may be hacking up a sparse matrix or 172 # something. 173 c2i = {} # class to index of class 174 for index, key in enumerate(nb.classes): 175 c2i[key] = index 176 observations = [[] for c in nb.classes] # separate observations by class 177 for i in range(len(results)): 178 klass, obs = results[i], training_set[i] 179 observations[c2i[klass]].append(obs) 180 # Now make the observations Numeric matrics. 181 for i in range(len(observations)): 182 # XXX typecode must be specified! 183 observations[i] = numpy.asarray(observations[i], typecode) 184 185 # Calculate P(value|class,dim) for every class. 186 # This is a good loop to optimize. 187 nb.p_conditional = [] 188 for i in range(len(nb.classes)): 189 class_observations = observations[i] # observations for this class 190 nb.p_conditional.append([None] * nb.dimensionality) 191 for j in range(nb.dimensionality): 192 # Collect all the values in this dimension. 193 values = class_observations[:, j] 194 195 # Add pseudocounts here. This needs to be parameterized. 196 #values = list(values) + range(len(nb.classes)) # XXX add 1 197 198 # Estimate P(value|class,dim) 199 nb.p_conditional[i][j] = _contents(values) 200 return nb
201 202 if __name__ == "__main__": 203 # Car data from example 'Naive Bayes Classifier example' by Eric Meisner November 22, 2003 204 # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf 205 xcar=[ 206 ['Red', 'Sports', 'Domestic'], 207 ['Red', 'Sports', 'Domestic'], 208 ['Red', 'Sports', 'Domestic'], 209 ['Yellow', 'Sports', 'Domestic'], 210 ['Yellow', 'Sports', 'Imported'], 211 ['Yellow', 'SUV', 'Imported'], 212 ['Yellow', 'SUV', 'Imported'], 213 ['Yellow', 'SUV', 'Domestic'], 214 ['Red', 'SUV', 'Imported'], 215 ['Red', 'Sports', 'Imported'] 216 ] 217 218 ycar=[ 219 'Yes', 220 'No', 221 'Yes', 222 'No', 223 'Yes', 224 'No', 225 'Yes', 226 'No', 227 'No', 228 'Yes' 229 ] 230 231 carmodel = train(xcar, ycar) 232 carresult = classify(carmodel, ['Red', 'Sports', 'Domestic']) 233 print 'Is Yes?', carresult 234 carresult = classify(carmodel, ['Red', 'SUV', 'Domestic']) 235 print 'Is No?', carresult 236