1
2
3
4
5
6 """This provides code for a general Naive Bayes learner.
7
8 Naive Bayes is a supervised classification algorithm that uses Bayes
9 rule to compute the fit between a new observation and some previously
10 observed data. The observations are discrete feature vectors, with
11 the Bayes assumption that the features are independent. Although this
12 is hardly ever true, the classifier works well enough in practice.
13
14 Glossary:
15 observation A feature vector of discrete data.
16 class A possible classification for an observation.
17
18
19 Classes:
20 NaiveBayes Holds information for a naive Bayes classifier.
21
22 Functions:
23 train Train a new naive Bayes classifier.
24 calculate Calculate the probabilities of each class, given an observation.
25 classify Classify an observation into a class.
26
27 """
28
29 import numpy
30
31
32 -def _contents(items):
33 term = 1.0/len(items)
34 counts = {}
35 for item in items:
36 counts[item] = counts.get(item, 0) + term
37 return counts
38
39
41 """Holds information for a NaiveBayes classifier.
42
43 Members:
44 classes List of the possible classes of data.
45 p_conditional CLASS x DIM array of dicts of value -> P(value|class,dim)
46 p_prior List of the prior probabilities for every class.
47 dimensionality Dimensionality of the data.
48
49 """
51 self.classes = []
52 self.p_conditional = None
53 self.p_prior = []
54 self.dimensionality = None
55
56
58 """calculate(nb, observation[, scale]) -> probability dict
59
60 Calculate log P(class|observation) for each class. nb is a NaiveBayes
61 classifier that has been trained. observation is a list representing
62 the observed data. scale is whether the probability should be
63 scaled by P(observation). By default, no scaling is done. The return
64 value is a dictionary where the keys is the class and the value is the
65 log probability of the class.
66
67 """
68
69
70
71
72
73 if len(observation) != nb.dimensionality:
74 raise ValueError("observation in %d dimension, but classifier in %d"
75 % (len(observation), nb.dimensionality))
76
77
78 n = len(nb.classes)
79 lp_observation_class = numpy.zeros(n)
80 for i in range(n):
81
82 probs = [None] * len(observation)
83 for j in range(len(observation)):
84 probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
85 lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
86 lp_observation_class[i] = sum(lprobs)
87
88
89 lp_prior = numpy.log(nb.p_prior)
90
91
92 lp_observation = 0.0
93 if scale:
94
95 obs = numpy.exp(numpy.clip(lp_prior+lp_observation_class, -700, +700))
96 lp_observation = numpy.log(sum(obs))
97
98
99 lp_class_observation = {}
100 for i in range(len(nb.classes)):
101 lp_class_observation[nb.classes[i]] = \
102 lp_observation_class[i] + lp_prior[i] - lp_observation
103
104 return lp_class_observation
105
106
108 """classify(nb, observation) -> class
109
110 Classify an observation into a class.
111
112 """
113
114 probs = calculate(nb, observation, scale=0)
115 max_prob = max_class = None
116 for klass in nb.classes:
117 if max_prob is None or probs[klass] > max_prob:
118 max_prob, max_class = probs[klass], klass
119 return max_class
120
121
122 -def train(training_set, results, priors=None, typecode=None):
123 """train(training_set, results[, priors]) -> NaiveBayes
124
125 Train a naive bayes classifier on a training set. training_set is a
126 list of observations. results is a list of the class assignments
127 for each observation. Thus, training_set and results must be the same
128 length. priors is an optional dictionary specifying the prior
129 probabilities for each type of result. If not specified, the priors
130 will be estimated from the training results.
131
132 """
133 if not len(training_set):
134 raise ValueError("No data in the training set.")
135 if len(training_set) != len(results):
136 raise ValueError("training_set and results should be parallel lists.")
137
138
139
140
141
142
143
144
145 dimensions = [len(x) for x in training_set]
146 if min(dimensions) != max(dimensions):
147 raise ValueError("observations have different dimensionality")
148
149 nb = NaiveBayes()
150 nb.dimensionality = dimensions[0]
151
152
153
154 if priors is not None:
155 percs = priors
156 nb.classes = list(set(results))
157 else:
158 class_freq = _contents(results)
159 nb.classes = class_freq.keys()
160 percs = class_freq
161 nb.classes.sort()
162
163 nb.p_prior = numpy.zeros(len(nb.classes))
164 for i in range(len(nb.classes)):
165 nb.p_prior[i] = percs[nb.classes[i]]
166
167
168
169
170
171
172
173 c2i = {}
174 for index, key in enumerate(nb.classes):
175 c2i[key] = index
176 observations = [[] for c in nb.classes]
177 for i in range(len(results)):
178 klass, obs = results[i], training_set[i]
179 observations[c2i[klass]].append(obs)
180
181 for i in range(len(observations)):
182
183 observations[i] = numpy.asarray(observations[i], typecode)
184
185
186
187 nb.p_conditional = []
188 for i in range(len(nb.classes)):
189 class_observations = observations[i]
190 nb.p_conditional.append([None] * nb.dimensionality)
191 for j in range(nb.dimensionality):
192
193 values = class_observations[:, j]
194
195
196
197
198
199 nb.p_conditional[i][j] = _contents(values)
200 return nb
201
202 if __name__ == "__main__":
203
204
205 xcar=[
206 ['Red', 'Sports', 'Domestic'],
207 ['Red', 'Sports', 'Domestic'],
208 ['Red', 'Sports', 'Domestic'],
209 ['Yellow', 'Sports', 'Domestic'],
210 ['Yellow', 'Sports', 'Imported'],
211 ['Yellow', 'SUV', 'Imported'],
212 ['Yellow', 'SUV', 'Imported'],
213 ['Yellow', 'SUV', 'Domestic'],
214 ['Red', 'SUV', 'Imported'],
215 ['Red', 'Sports', 'Imported']
216 ]
217
218 ycar=[
219 'Yes',
220 'No',
221 'Yes',
222 'No',
223 'Yes',
224 'No',
225 'Yes',
226 'No',
227 'No',
228 'Yes'
229 ]
230
231 carmodel = train(xcar, ycar)
232 carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
233 print 'Is Yes?', carresult
234 carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
235 print 'Is No?', carresult
236