
Source Code for Module Bio.NaiveBayes

# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""General Naive Bayes learner.

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data.  The observations are discrete feature vectors, with
the Bayes assumption that the features are independent.  Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class       - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train     - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify  - Classify an observation into a class.

"""
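
# A minimal usage sketch (toy data assumed for illustration; this block is
# not part of the original module).  Observations are parallel to their
# class labels:
#
#     >>> observations = [["yes", "high"], ["no", "low"], ["yes", "low"]]
#     >>> labels = ["spam", "ham", "spam"]
#     >>> model = train(observations, labels)
#     >>> classify(model, ["yes", "high"])
#     'spam'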

from __future__ import print_function

import numpy

def _contents(items):
    """Return a dictionary mapping each item to its relative frequency (PRIVATE)."""
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts
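
# For example (illustrative): _contents(["a", "a", "b"]) returns
# {"a": 0.666..., "b": 0.333...}, i.e. the relative frequency of each item.

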
class NaiveBayes(object):
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes - List of the possible classes of data.
     - p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


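# Layout sketch (illustrative, not from the original source): after training
# on 2-dimensional data with nb.classes == ["ham", "spam"],
# nb.p_conditional[0][1] is a dict mapping feature values to
# P(value | class "ham", dimension 1), e.g. {"low": 0.5, "high": 0.5},
# and nb.p_prior[0] holds P("ham").

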
def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``.  By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in {0} dimension, but classifier in {1}".format(
            len(observation), nb.dimensionality))

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)  # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = numpy.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0  # P(observation)
    if scale:  # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}  # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = \
            lp_observation_class[i] + lp_prior[i] - lp_observation

    return lp_class_observation
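
# Sanity-check sketch (assumption, not part of the original source): with
# scale=True the returned values are log posteriors, so exponentiating and
# summing them, e.g.
#     sum(numpy.exp(lp) for lp in calculate(nb, obs, scale=True).values())
# should give approximately 1.0 for a trained classifier nb.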


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors - Optional dictionary specifying the prior probabilities
       for each type of result.  If not specified, the priors will
       be estimated from the training results.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()  # keep it tidy

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}  # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
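

# A small self-test sketch (assumption: this block is not part of the
# original module; it only demonstrates the train/calculate/classify
# round trip on made-up toy data).
if __name__ == "__main__":
    xs = [["sunny", "hot"], ["sunny", "mild"],
          ["rainy", "mild"], ["rainy", "hot"]]
    ys = ["beach", "beach", "indoors", "indoors"]
    model = train(xs, ys)
    print(classify(model, ["sunny", "mild"]))  # expected: beach
    print(calculate(model, ["rainy", "hot"], scale=True))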