Package Bio :: Module NaiveBayes
[hide private]
[frames] | [no frames]

Source Code for Module Bio.NaiveBayes

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This provides code for a general Naive Bayes learner. 
  7   
  8  Naive Bayes is a supervised classification algorithm that uses Bayes 
  9  rule to compute the fit between a new observation and some previously 
 10  observed data.  The observations are discrete feature vectors, with 
 11  the Bayes assumption that the features are independent.  Although this 
 12  is hardly ever true, the classifier works well enough in practice. 
 13   
 14  Glossary: 
 15  observation    A feature vector of discrete data. 
 16  class          A possible classification for an observation. 
 17   
 18   
 19  Classes: 
 20  NaiveBayes     Holds information for a naive Bayes classifier. 
 21   
 22  Functions: 
 23  train          Train a new naive Bayes classifier. 
 24  calculate      Calculate the probabilities of each class, given an observation. 
 25  classify       Classify an observation into a class. 
 26   
 27  """ 
 28   
 29  from __future__ import print_function 
 30   
 31  import numpy 
 32   
 33   
34 -def _contents(items):
35 term = 1.0/len(items) 36 counts = {} 37 for item in items: 38 counts[item] = counts.get(item, 0) + term 39 return counts
40 41
class NaiveBayes(object):
    """Hold the parameters of a naive Bayes classifier.

    Members:
    classes         List of the possible classes of data.
    p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
    p_prior         List of the prior probabilities for every class.
    dimensionality  Dimensionality of the data.

    """
    def __init__(self):
        # A freshly constructed classifier is untrained; train() is
        # responsible for filling in all four members.
        self.classes = []
        self.p_prior = []
        self.p_conditional = None
        self.dimensionality = None
57 58
def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class.  nb is a trained
    NaiveBayes classifier and observation is a list of the observed data.
    If scale is true, the probabilities are scaled by P(observation)
    (by default they are not).  Returns a dictionary mapping each class
    to the log probability of that class.

    """
    # Bayes rule, taken in log space:
    #   lP(class|obs) = lP(obs|class) + lP(class) - lP(obs)

    # The observation must match the classifier's dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimension, but classifier in %d"
                         % (len(observation), nb.dimensionality))

    n_classes = len(nb.classes)
    # log P(observation|class), one entry per class.
    lp_observation_class = numpy.zeros(n_classes)
    for i in range(n_classes):
        # Independence assumption: sum per-dimension log probabilities.
        # Values never seen in training get probability 0, which is
        # clipped so the log stays finite.
        probs = [nb.p_conditional[i][j].get(value, 0)
                 for j, value in enumerate(observation)]
        lp_observation_class[i] = sum(
            numpy.log(numpy.clip(probs, 1.e-300, 1.e+300)))

    # log P(class) for every class.
    lp_prior = numpy.log(nb.p_prior)

    # log P(observation); only computed when scaling was requested.
    lp_observation = 0.0
    if scale:
        # log P(obs) = log SUM_i P(obs|class_i) P(class_i); clip the
        # exponent so exp() neither underflows nor overflows.
        joint = numpy.exp(
            numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(joint))

    # Assemble the dict of class -> log P(class|observation).
    return dict((klass, lp_observation_class[i] + lp_prior[i] - lp_observation)
                for i, klass in enumerate(nb.classes))
107 108
def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class.

    """
    scores = calculate(nb, observation, scale=0)
    # Choose the class with the highest log probability.  Ties go to the
    # class appearing earliest in nb.classes, matching max()'s
    # first-occurrence-wins behavior.
    return max(nb.classes, key=lambda klass: scores[klass])
122 123
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations and results is the parallel list of class
    assignments, so the two must have the same length.  priors is an
    optional dictionary giving the prior probability of each class; when
    omitted, the priors are estimated from the training results.
    typecode, if given, is passed through to numpy when the per-class
    observation matrices are built.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Every observation must have the same number of dimensions.
    dimensions = [len(obs) for obs in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Determine the set of classes and their prior probabilities.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        percs = _contents(results)
        nb.classes = list(percs.keys())
    nb.classes.sort()    # keep it tidy

    nb.p_prior = numpy.zeros(len(nb.classes))
    for index, klass in enumerate(nb.classes):
        nb.p_prior[index] = percs[klass]

    # Group the observations by class: for each class, collect the rows
    # of a matrix of training instances versus dimensions.  The training
    # data may be something sparse rather than a real matrix, so the
    # rows are gathered in plain lists first.
    class_index = dict((klass, index)
                       for index, klass in enumerate(nb.classes))
    grouped = [[] for _ in nb.classes]
    for klass, obs in zip(results, training_set):
        grouped[class_index[klass]].append(obs)
    # Turn each per-class group into a numpy matrix.
    # XXX typecode must be specified!
    matrices = [numpy.asarray(rows, typecode) for rows in grouped]

    # Estimate P(value|class,dim) for every class and dimension.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for matrix in matrices:
        per_dim = [None] * nb.dimensionality
        for dim in range(nb.dimensionality):
            # Pseudocounts would be added here; not yet parameterized.
            per_dim[dim] = _contents(matrix[:, dim])
        nb.p_conditional.append(per_dim)
    return nb
if __name__ == "__main__":
    # Car data from example 'Naive Bayes Classifier example' by Eric Meisner November 22, 2003
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf
    # Each entry pairs an observation (color, type, origin) with its class.
    car_examples = [
        (['Red', 'Sports', 'Domestic'], 'Yes'),
        (['Red', 'Sports', 'Domestic'], 'No'),
        (['Red', 'Sports', 'Domestic'], 'Yes'),
        (['Yellow', 'Sports', 'Domestic'], 'No'),
        (['Yellow', 'Sports', 'Imported'], 'Yes'),
        (['Yellow', 'SUV', 'Imported'], 'No'),
        (['Yellow', 'SUV', 'Imported'], 'Yes'),
        (['Yellow', 'SUV', 'Domestic'], 'No'),
        (['Red', 'SUV', 'Imported'], 'No'),
        (['Red', 'Sports', 'Imported'], 'Yes'),
    ]
    xcar = [features for features, _ in car_examples]
    ycar = [label for _, label in car_examples]

    carmodel = train(xcar, ycar)
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    print('Is Yes? %s' % carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print('Is No? %s' % carresult)